Improve examples & related tests (#7773)

* [WIP] Improve load_examples

Related to #7472; longer term we will generate the examples by exporting
them into a tarball, as described in #7472. In the meantime, we need this
subset of the features:

* allow specifying an alternate database connection for the examples
* allow an --only-metadata flag on `load_examples` that loads only
  dashboard and chart definitions, without loading the actual data

* Improve logging

* Rename data->examples

* Only load data if the table does not already exist

* By default, do not reload existing data; add a --force flag

* Fix build

* Set published to true
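
For orientation, a minimal sketch of what the new options amount to, expressed
against the `load_examples_run` signature introduced in superset/cli.py below
(these are the internal equivalents of the `--only-metadata` and `--force` CLI
flags; it assumes an initialized Superset app context):

    # Python sketch; mirrors the click flags defined in superset/cli.py
    from superset.cli import load_examples_run

    # Default: load dashboard/chart metadata plus the backing example data
    load_examples_run(load_test_data=False)

    # --only-metadata: create dashboards, charts and table references only
    load_examples_run(load_test_data=False, only_metadata=True)

    # --force: reload data even when the example tables already exist
    load_examples_run(load_test_data=False, force=True)
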
mistercrunch authored Jul 17, 2019
1 parent 86fdceb commit d65b039
Showing 45 changed files with 583 additions and 491 deletions.
MANIFEST.in (2 changes: 1 addition & 1 deletion)

@@ -18,7 +18,7 @@ include NOTICE
 include LICENSE.txt
 graft licenses/
 include README.md
-recursive-include superset/data *
+recursive-include superset/examples *
 recursive-include superset/migrations *
 recursive-include superset/static *
 recursive-exclude superset/static/assets/docs *

superset/cli.py (57 changes: 34 additions & 23 deletions)

@@ -26,7 +26,7 @@
 from pathlib2 import Path
 import yaml
 
-from superset import app, appbuilder, data, db, security_manager
+from superset import app, appbuilder, db, examples, security_manager
 from superset.utils import core as utils, dashboard_import_export, dict_import_export
 
 config = app.config
@@ -46,6 +46,7 @@ def make_shell_context():
 def init():
     """Inits the Superset application"""
     utils.get_or_create_main_db()
+    utils.get_example_database()
     appbuilder.add_permissions(update_perms=True)
     security_manager.sync_role_definitions()
@@ -67,66 +68,76 @@ def version(verbose):
     print(Style.RESET_ALL)
 
 
-def load_examples_run(load_test_data):
-    print("Loading examples into {}".format(db))
+def load_examples_run(load_test_data, only_metadata=False, force=False):
+    if only_metadata:
+        print("Loading examples metadata")
+    else:
+        examples_db = utils.get_example_database()
+        print(f"Loading examples metadata and related data into {examples_db}")
 
-    data.load_css_templates()
+    examples.load_css_templates()
 
     print("Loading energy related dataset")
-    data.load_energy()
+    examples.load_energy(only_metadata, force)
 
     print("Loading [World Bank's Health Nutrition and Population Stats]")
-    data.load_world_bank_health_n_pop()
+    examples.load_world_bank_health_n_pop(only_metadata, force)
 
     print("Loading [Birth names]")
-    data.load_birth_names()
+    examples.load_birth_names(only_metadata, force)
 
     print("Loading [Unicode test data]")
-    data.load_unicode_test_data()
+    examples.load_unicode_test_data(only_metadata, force)
 
     if not load_test_data:
         print("Loading [Random time series data]")
-        data.load_random_time_series_data()
+        examples.load_random_time_series_data(only_metadata, force)
 
         print("Loading [Random long/lat data]")
-        data.load_long_lat_data()
+        examples.load_long_lat_data(only_metadata, force)
 
         print("Loading [Country Map data]")
-        data.load_country_map_data()
+        examples.load_country_map_data(only_metadata, force)
 
         print("Loading [Multiformat time series]")
-        data.load_multiformat_time_series()
+        examples.load_multiformat_time_series(only_metadata, force)
 
         print("Loading [Paris GeoJson]")
-        data.load_paris_iris_geojson()
+        examples.load_paris_iris_geojson(only_metadata, force)
 
         print("Loading [San Francisco population polygons]")
-        data.load_sf_population_polygons()
+        examples.load_sf_population_polygons(only_metadata, force)
 
         print("Loading [Flights data]")
-        data.load_flights()
+        examples.load_flights(only_metadata, force)
 
         print("Loading [BART lines]")
-        data.load_bart_lines()
+        examples.load_bart_lines(only_metadata, force)
 
         print("Loading [Multi Line]")
-        data.load_multi_line()
+        examples.load_multi_line(only_metadata)
 
         print("Loading [Misc Charts] dashboard")
-        data.load_misc_dashboard()
+        examples.load_misc_dashboard()
 
         print("Loading DECK.gl demo")
-        data.load_deck_dash()
+        examples.load_deck_dash()
 
     print("Loading [Tabbed dashboard]")
-    data.load_tabbed_dashboard()
+    examples.load_tabbed_dashboard(only_metadata)
 
 
 @app.cli.command()
 @click.option("--load-test-data", "-t", is_flag=True, help="Load additional test data")
-def load_examples(load_test_data):
+@click.option(
+    "--only-metadata", "-m", is_flag=True, help="Only load metadata, skip actual data"
+)
+@click.option(
+    "--force", "-f", is_flag=True, help="Force load data even if table already exists"
+)
+def load_examples(load_test_data, only_metadata=False, force=False):
     """Loads a set of Slices and Dashboards and a supporting dataset """
-    load_examples_run(load_test_data)
+    load_examples_run(load_test_data, only_metadata, force)
 
 
 @app.cli.command()
@@ -405,7 +416,7 @@ def load_test_users_run():
         for perm in security_manager.find_role("Gamma").permissions:
             security_manager.add_permission_role(gamma_sqllab_role, perm)
         utils.get_or_create_main_db()
-        db_perm = utils.get_main_database(security_manager.get_session).perm
+        db_perm = utils.get_main_database().perm
         security_manager.add_permission_view_menu("database_access", db_perm)
         db_pvm = security_manager.find_permission_view_menu(
             view_menu_name=db_perm, permission_name="database_access"

superset/config.py (4 changes: 4 additions & 0 deletions)

@@ -617,6 +617,10 @@ class CeleryConfig(object):
     "force_https_permanent": False,
 }
 
+# URI to database storing the example data, points to
+# SQLALCHEMY_DATABASE_URI by default if set to `None`
+SQLALCHEMY_EXAMPLES_URI = None
+
 try:
     if CONFIG_PATH_ENV_VAR in os.environ:
         # Explicitly import config module that is not in pythonpath; useful

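To make the new setting concrete, a minimal superset_config.py sketch; the
URIs below are placeholders for illustration, not values from this commit:

    # superset_config.py (hypothetical values)
    # Application metadata stays in the main database...
    SQLALCHEMY_DATABASE_URI = "postgresql://superset@localhost/superset"
    # ...while the example datasets land in a dedicated database. Leaving
    # SQLALCHEMY_EXAMPLES_URI as None falls back to SQLALCHEMY_DATABASE_URI.
    SQLALCHEMY_EXAMPLES_URI = "postgresql://superset@localhost/examples"
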
superset/connectors/connector_registry.py (15 changes: 3 additions & 12 deletions)

@@ -55,18 +55,9 @@ def get_datasource_by_name(
         cls, session, datasource_type, datasource_name, schema, database_name
     ):
         datasource_class = ConnectorRegistry.sources[datasource_type]
-        datasources = session.query(datasource_class).all()
-
-        # Filter datasoures that don't have database.
-        db_ds = [
-            d
-            for d in datasources
-            if d.database
-            and d.database.name == database_name
-            and d.name == datasource_name
-            and schema == schema
-        ]
-        return db_ds[0]
+        return datasource_class.get_datasource_by_name(
+            session, datasource_name, schema, database_name
+        )
 
     @classmethod
     def query_datasources_by_permissions(cls, session, database, permissions):

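For reference, a sketch of how the delegated lookup is invoked; the argument
values are hypothetical, while the signature matches the registry method above:

    from superset import db
    from superset.connectors.connector_registry import ConnectorRegistry

    # The registry now only dispatches on datasource_type; each connector
    # class owns its query (SQLA joins Database, Druid joins DruidCluster).
    tbl = ConnectorRegistry.get_datasource_by_name(
        db.session,
        "table",         # datasource_type, e.g. "table" (SQLA) or "druid"
        "birth_names",   # datasource_name (hypothetical)
        None,            # schema; the SQLA connector treats '' and None alike
        "examples",      # database_name (hypothetical)
    )
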
superset/connectors/druid/models.py (10 changes: 10 additions & 0 deletions)

@@ -732,6 +732,16 @@ def time_offset(granularity):
             return 6 * 24 * 3600 * 1000  # 6 days
         return 0
 
+    @classmethod
+    def get_datasource_by_name(cls, session, datasource_name, schema, database_name):
+        query = (
+            session.query(cls)
+            .join(DruidCluster)
+            .filter(cls.datasource_name == datasource_name)
+            .filter(DruidCluster.cluster_name == database_name)
+        )
+        return query.first()
+
     # uses https://en.wikipedia.org/wiki/ISO_8601
     # http://druid.io/docs/0.8.0/querying/granularities.html
     # TODO: pass origin from the UI

superset/connectors/sqla/models.py (15 changes: 15 additions & 0 deletions)

@@ -374,6 +374,21 @@ def datasource_name(self):
     def database_name(self):
         return self.database.name
 
+    @classmethod
+    def get_datasource_by_name(cls, session, datasource_name, schema, database_name):
+        schema = schema or None
+        query = (
+            session.query(cls)
+            .join(Database)
+            .filter(cls.table_name == datasource_name)
+            .filter(Database.database_name == database_name)
+        )
+        # Handling schema being '' or None, which is easier to handle
+        # in python than in the SQLA query in a multi-dialect way
+        for tbl in query.all():
+            if schema == (tbl.schema or None):
+                return tbl
+
     @property
     def link(self):
         name = escape(self.name)

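A note on the schema normalization above: `schema = schema or None` maps the
empty string to None before comparing against `tbl.schema or None`, so both
spellings of "no schema" resolve identically. A small sketch with hypothetical
table and database names:

    from superset import db
    from superset.connectors.sqla.models import SqlaTable

    # '' and None both normalize to "no schema", so these two calls find
    # the same table when the stored tbl.schema is NULL or ''.
    tbl_a = SqlaTable.get_datasource_by_name(db.session, "birth_names", "", "examples")
    tbl_b = SqlaTable.get_datasource_by_name(db.session, "birth_names", None, "examples")
    assert tbl_a is tbl_b
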
File renamed without changes.
superset/data/bart_lines.py → superset/examples/bart_lines.py (49 changes: 27 additions & 22 deletions)

@@ -21,37 +21,42 @@
 from sqlalchemy import String, Text
 
 from superset import db
-from superset.utils.core import get_or_create_main_db
-from .helpers import TBL, get_example_data
+from superset.utils.core import get_example_database
+from .helpers import get_example_data, TBL
 
 
-def load_bart_lines():
+def load_bart_lines(only_metadata=False, force=False):
     tbl_name = "bart_lines"
-    content = get_example_data("bart-lines.json.gz")
-    df = pd.read_json(content, encoding="latin-1")
-    df["path_json"] = df.path.map(json.dumps)
-    df["polyline"] = df.path.map(polyline.encode)
-    del df["path"]
+    database = get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
 
-    df.to_sql(
-        tbl_name,
-        db.engine,
-        if_exists="replace",
-        chunksize=500,
-        dtype={
-            "color": String(255),
-            "name": String(255),
-            "polyline": Text,
-            "path_json": Text,
-        },
-        index=False,
-    )
+    if not only_metadata and (not table_exists or force):
+        content = get_example_data("bart-lines.json.gz")
+        df = pd.read_json(content, encoding="latin-1")
+        df["path_json"] = df.path.map(json.dumps)
+        df["polyline"] = df.path.map(polyline.encode)
+        del df["path"]
+
+        df.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "color": String(255),
+                "name": String(255),
+                "polyline": Text,
+                "path_json": Text,
+            },
+            index=False,
+        )
+
     print("Creating table {} reference".format(tbl_name))
     tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
     if not tbl:
         tbl = TBL(table_name=tbl_name)
     tbl.description = "BART lines"
-    tbl.database = get_or_create_main_db()
+    tbl.database = database
     db.session.merge(tbl)
     db.session.commit()
     tbl.fetch_metadata()

superset/data/birth_names.py → superset/examples/birth_names.py (60 changes: 33 additions & 27 deletions)

@@ -23,7 +23,7 @@
 
 from superset import db, security_manager
 from superset.connectors.sqla.models import SqlMetric, TableColumn
-from superset.utils.core import get_or_create_main_db
+from superset.utils.core import get_example_database
 from .helpers import (
     config,
     Dash,
@@ -36,33 +36,39 @@
 )
 
 
-def load_birth_names():
+def load_birth_names(only_metadata=False, force=False):
     """Loading birth name dataset from a zip file in the repo"""
-    data = get_example_data("birth_names.json.gz")
-    pdf = pd.read_json(data)
-    pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
-    pdf.to_sql(
-        "birth_names",
-        db.engine,
-        if_exists="replace",
-        chunksize=500,
-        dtype={
-            "ds": DateTime,
-            "gender": String(16),
-            "state": String(10),
-            "name": String(255),
-        },
-        index=False,
-    )
-    print("Done loading table!")
-    print("-" * 80)
+    # pylint: disable=too-many-locals
+    tbl_name = "birth_names"
+    database = get_example_database()
+    table_exists = database.has_table_by_name(tbl_name)
+
+    if not only_metadata and (not table_exists or force):
+        pdf = pd.read_json(get_example_data("birth_names.json.gz"))
+        pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
+        pdf.to_sql(
+            tbl_name,
+            database.get_sqla_engine(),
+            if_exists="replace",
+            chunksize=500,
+            dtype={
+                "ds": DateTime,
+                "gender": String(16),
+                "state": String(10),
+                "name": String(255),
+            },
+            index=False,
+        )
+        print("Done loading table!")
+        print("-" * 80)
 
-    print("Creating table [birth_names] reference")
-    obj = db.session.query(TBL).filter_by(table_name="birth_names").first()
+    obj = db.session.query(TBL).filter_by(table_name=tbl_name).first()
     if not obj:
-        obj = TBL(table_name="birth_names")
+        print(f"Creating table [{tbl_name}] reference")
+        obj = TBL(table_name=tbl_name)
+        db.session.add(obj)
     obj.main_dttm_col = "ds"
-    obj.database = get_or_create_main_db()
+    obj.database = database
     obj.filter_select_enabled = True
 
     if not any(col.column_name == "num_california" for col in obj.columns):
@@ -79,7 +85,6 @@ def load_birth_names():
     col = str(column("num").compile(db.engine))
     obj.metrics.append(SqlMetric(metric_name="sum__num", expression=f"SUM({col})"))
 
-    db.session.merge(obj)
     db.session.commit()
     obj.fetch_metadata()
     tbl = obj
@@ -384,10 +389,12 @@ def load_birth_names():
         merge_slice(slc)
 
     print("Creating a dashboard")
-    dash = db.session.query(Dash).filter_by(dashboard_title="Births").first()
+    dash = db.session.query(Dash).filter_by(slug="births").first()
 
     if not dash:
         dash = Dash()
+        db.session.add(dash)
+    dash.published = True
     js = textwrap.dedent(
         # pylint: disable=line-too-long
         """\
@@ -649,5 +656,4 @@ def load_birth_names():
     dash.dashboard_title = "Births"
     dash.position_json = json.dumps(pos, indent=4)
     dash.slug = "births"
-    db.session.merge(dash)
     db.session.commit()

File renamed without changes.
File renamed without changes.
