Experiment Profiling [Resolves #557] (#558)
- Add cProfile bindings to experiment.run, writing stats to the profiling_stats directory in the project path
- Add CLI option for profiling
- Add section on profiling to docs
thcrock authored Jan 3, 2019
1 parent 0fea3ee commit c608a7c
Showing 5 changed files with 61 additions and 5 deletions.
14 changes: 14 additions & 0 deletions docs/sources/experiments/running.md
@@ -283,6 +283,9 @@ Here's an example query, which returns the top 10 model groups by precision at t
limit 10
```




## Inspecting an Experiment before running

Before you run an experiment, you can inspect properties of the Experiment object to ensure that it is configured in the way you want. Some examples:
@@ -293,6 +296,17 @@ Before you run an experiment, you can inspect properties of the Experiment object

## Optimizing Experiment Performance

### Profiling an Experiment

Experiment running slowly? Try profiling it: pass `profile=True` to the Experiment constructor, or use `--profile` on the command line. This writes a cProfile stats file to the `profiling_stats` directory under the project path. The file is in a binary format, but it can be read by a variety of visualization programs:

- [snakeviz](https://jiffyclub.github.io/snakeviz/): a browser-based graphical viewer
- [tuna](https://github.com/nschloe/tuna): another browser-based graphical viewer
- [gprof2dot](https://github.com/jrfonseca/gprof2dot): a command-line tool to convert stats files to Graphviz format
- [pyprof2calltree](https://pypi.org/project/pyprof2calltree/): a command-line tool to convert stats files to the Valgrind log format, for viewing in established viewers like KCachegrind

Looking at the profile in one of these visualization programs, you can see which portions of the experiment take the most time and prioritize changes accordingly. For instance, if cohort, label, or feature table generation is taking up the bulk of the time, you might add indexes to the source tables or increase the number of database processes. On the other hand, if model training is the culprit, you could temporarily switch to a smaller model grid to get results more quickly.
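
As a quick sketch of the workflow, you can enable profiling from Python and then summarize the resulting file with the standard library's `pstats` module. In the sketch below, `my_experiment_config`, `db_engine`, the project path, and the stats file name are placeholders; the stats file is named with the Unix timestamp of the run, so the exact name will differ.

```python
from triage.experiments import SingleThreadedExperiment
import pstats

# Run the experiment with profiling enabled; a cProfile stats file is written
# to <project_path>/profiling_stats/.
experiment = SingleThreadedExperiment(
    config=my_experiment_config,      # your experiment config dict
    db_engine=db_engine,              # your SQLAlchemy engine
    project_path="/path/to/project",
    profile=True,
)
experiment.run()

# Load the stats file (named with the run's Unix timestamp) and print the
# 20 entries with the highest cumulative time.
stats = pstats.Stats("/path/to/project/profiling_stats/1546540800.profile")
stats.sort_stats("cumulative").print_stats(20)
```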

### materialize_subquery_fromobjs
By default, experiments inspect the `from_obj` of every feature aggregation to see whether it looks like a subquery. If it does, they create a table from it, index it on the `knowledge_date_column` and `entity_id`, and use that table for running feature queries. This can make feature generation much faster when the `from_obj` takes a while to run and/or the experiment has many as-of-dates. It is skipped for `from_obj`s that are plain tables or simple joins (e.g. `entities join events using (entity_id)`), since the existing indexes on those tables should work just fine.
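
If this behavior gets in the way (for example, while debugging a feature query), it can be turned off. Here is a minimal sketch using the constructor argument; the same switch is exposed on the command line as `--no-materialize-fromobjs`. The config, engine, and path below are placeholders.

```python
from triage.experiments import SingleThreadedExperiment

# Run feature queries directly against each from_obj instead of materializing
# and indexing subquery from_objs first.
experiment = SingleThreadedExperiment(
    config=my_experiment_config,      # your experiment config dict
    db_engine=db_engine,              # your SQLAlchemy engine
    project_path="/path/to/project",
    materialize_subquery_fromobjs=False,
)
experiment.run()
```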

4 changes: 1 addition & 3 deletions src/tests/catwalk_tests/test_storage.py
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import yaml
from moto import mock_s3
import boto3
from numpy.testing import assert_almost_equal

from triage.component.catwalk.storage import (
@@ -24,8 +25,6 @@ def __init__(self, val):

def test_S3Store():
with mock_s3():
import boto3

client = boto3.client("s3")
client.create_bucket(Bucket="test_bucket", ACL="public-read-write")
store = S3Store(f"s3://test_bucket/a_path")
@@ -176,7 +175,6 @@ def test_as_of_dates_entity_date_index(self):

def test_s3_save(self):
with mock_s3():
import boto3

client = boto3.client("s3")
client.create_bucket(Bucket="fake-matrix-bucket", ACL="public-read-write")
13 changes: 13 additions & 0 deletions src/tests/test_experiments.py
@@ -351,6 +351,19 @@ def test_custom_label_name(experiment_class):
assert experiment.planner.label_names == ["custom_label_name"]


def test_profiling(db_engine):
    populate_source_data(db_engine)
    with TemporaryDirectory() as temp_dir:
        project_path = os.path.join(temp_dir, "inspections")
        experiment = SingleThreadedExperiment(
            config=sample_config(),
            db_engine=db_engine,
            project_path=project_path,
            profile=True
        ).run()
        assert len(os.listdir(os.path.join(project_path, "profiling_stats"))) == 1


@parametrize_experiment_classes
def test_baselines_with_missing_features(experiment_class):
with testing.postgresql.Postgresql() as postgresql:
7 changes: 7 additions & 0 deletions src/triage/cli.py
@@ -200,6 +200,12 @@ def __init__(self, parser):
action="store_true",
help="only validate the config file not running Experiment",
)
parser.add_argument(
"--profile",
action="store_true",
dest="profile",
help="Record the time spent in various functions using cProfile"
)

        parser.add_argument(
            "--no-materialize-fromobjs",
@@ -223,6 +229,7 @@ def experiment(self):
"replace": self.args.replace,
"materialize_subquery_fromobjs": self.args.materialize_fromobjs,
"matrix_storage_class": self.matrix_storage_map[self.args.matrix_format],
"profile": self.args.profile,
}
if self.args.n_db_processes > 1 or self.args.n_processes > 1:
experiment = MultiCoreExperiment(
28 changes: 26 additions & 2 deletions src/triage/experiments/base.py
@@ -1,5 +1,8 @@
import logging
from abc import ABC, abstractmethod
import cProfile
import marshal
import time

from descriptors import cachedproperty
from timeout import timeout
@@ -68,6 +71,7 @@ class ExperimentBase(ABC):
materialize_subquery_fromobjs (bool, default True) Whether or not to create and index
tables for feature "from objects" that are subqueries. Can speed up performance
when building features for many as-of-dates.
profile (bool, default False) Whether to record a cProfile of the experiment run and write it to the profiling_stats directory in the project path
"""

cleanup_timeout = 60 # seconds
@@ -81,7 +85,8 @@ def __init__(
replace=True,
cleanup=False,
cleanup_timeout=None,
materialize_subquery_fromobjs=True
materialize_subquery_fromobjs=True,
profile=False,
):
self._check_config_version(config)
self.config = config
@@ -117,6 +122,8 @@ def __init__(self):
        self.cleanup_timeout = (
            self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
        )
        self.profile = profile
        logging.info("Generate profiling stats? (profile option): %s", self.profile)

def _check_config_version(self, config):
if "config_version" in config:
@@ -634,9 +641,26 @@ def clean_up_tables(self):
        self.cohort_table_generator.clean_up()
        self.label_generator.clean_up(self.labels_table_name)

    def _run_profile(self):
        cp = cProfile.Profile()
        cp.runcall(self._run)
        store = self.project_storage.get_store(
            ["profiling_stats"],
            f"{int(time.time())}.profile"
        )
        with store.open('wb') as fd:
            cp.create_stats()
            marshal.dump(cp.stats, fd)
logging.info("Profiling stats of this Triage run calculated and written to %s"
"in cProfile format.",
store)

def run(self):
try:
self._run()
if self.profile:
self._run_profile()
else:
self._run()
except Exception:
logging.exception("Run interrupted by uncaught exception")
raise
