Experiment Profiling [Resolves #557] (#558)
- Add cProfile bindings to experiment.run, writing stats to the profiling_stats directory in the project path
- Add CLI option for profiling
- Add section on profiling to docs
thcrock authored Jan 3, 2019
1 parent 0fea3ee commit c608a7c
Showing 5 changed files with 61 additions and 5 deletions.
14 changes: 14 additions & 0 deletions docs/sources/experiments/running.md
@@ -283,6 +283,9 @@ Here's an example query, which returns the top 10 model groups by precision at t
limit 10
```




## Inspecting an Experiment before running

Before you run an experiment, you can inspect properties of the Experiment object to ensure that it is configured in the way you want. Some examples:
@@ -293,6 +296,17 @@ Before you run an experiment, you can inspect properties of the Experiment object

## Optimizing Experiment Performance

### Profiling an Experiment

Experiment running slowly? Try profiling it: pass `profile=True` to the Experiment constructor, or use `--profile` on the command line. This writes a cProfile stats file to the `profiling_stats` directory under the project path. The file is in a binary format, but it can be read by a variety of visualization programs:

- [snakeviz](https://jiffyclub.github.io/snakeviz/): a browser-based graphical viewer
- [tuna](https://github.com/nschloe/tuna): another browser-based graphical viewer
- [gprof2dot](https://github.com/jrfonseca/gprof2dot): a command-line tool to convert stats files to Graphviz format
- [pyprof2calltree](https://pypi.org/project/pyprof2calltree/): a command-line tool to convert stats files to the Valgrind log format, for viewing in established viewers like KCachegrind

Looking at the profile in one of these visualization programs, you can see which portions of the experiment take the most time and prioritize changes accordingly. For instance, if cohort, label, or feature table generation is taking up the bulk of the time, you might add indexes to the source tables or increase the number of database processes. On the other hand, if model training is the culprit, you could temporarily switch to a smaller model grid to get results more quickly.
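
As a quick sketch of the workflow, you can enable profiling from Python and then summarize the resulting file with the standard library's `pstats` module. In the sketch below, `my_experiment_config`, `db_engine`, the project path, and the stats file name are placeholders; the stats file is named with the Unix timestamp of the run, so the exact name will differ.

```python
from triage.experiments import SingleThreadedExperiment
import pstats

# Run the experiment with profiling enabled; a cProfile stats file is written
# to <project_path>/profiling_stats/.
experiment = SingleThreadedExperiment(
    config=my_experiment_config,      # your experiment config dict
    db_engine=db_engine,              # your SQLAlchemy engine
    project_path="/path/to/project",
    profile=True,
)
experiment.run()

# Load the stats file (named with the run's Unix timestamp) and print the
# 20 entries with the highest cumulative time.
stats = pstats.Stats("/path/to/project/profiling_stats/1546540800.profile")
stats.sort_stats("cumulative").print_stats(20)
```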

### materialize_subquery_fromobjs
By default, experiments inspect the `from_obj` of every feature aggregation to see whether it looks like a subquery. If it does, they create a table from it, index it on the `knowledge_date_column` and `entity_id`, and use that table for running feature queries. This can make feature generation much faster when the `from_obj` takes a while to run and/or the experiment has many as-of-dates. It is skipped for `from_obj`s that are plain tables or simple joins (e.g. `entities join events using (entity_id)`), since the existing indexes on those tables should work just fine.
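
If this behavior gets in the way (for example, while debugging a feature query), it can be turned off. Here is a minimal sketch using the constructor argument; the same switch is exposed on the command line as `--no-materialize-fromobjs`. The config, engine, and path below are placeholders.

```python
from triage.experiments import SingleThreadedExperiment

# Run feature queries directly against each from_obj instead of materializing
# and indexing subquery from_objs first.
experiment = SingleThreadedExperiment(
    config=my_experiment_config,      # your experiment config dict
    db_engine=db_engine,              # your SQLAlchemy engine
    project_path="/path/to/project",
    materialize_subquery_fromobjs=False,
)
experiment.run()
```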

4 changes: 1 addition & 3 deletions src/tests/catwalk_tests/test_storage.py
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import yaml
from moto import mock_s3
import boto3
from numpy.testing import assert_almost_equal

from triage.component.catwalk.storage import (
@@ -24,8 +25,6 @@ def __init__(self, val):

def test_S3Store():
with mock_s3():
import boto3

client = boto3.client("s3")
client.create_bucket(Bucket="test_bucket", ACL="public-read-write")
store = S3Store(f"s3://test_bucket/a_path")
@@ -176,7 +175,6 @@ def test_as_of_dates_entity_date_index(self):

def test_s3_save(self):
with mock_s3():
import boto3

client = boto3.client("s3")
client.create_bucket(Bucket="fake-matrix-bucket", ACL="public-read-write")
13 changes: 13 additions & 0 deletions src/tests/test_experiments.py
@@ -351,6 +351,19 @@ def test_custom_label_name(experiment_class):
assert experiment.planner.label_names == ["custom_label_name"]


def test_profiling(db_engine):
    populate_source_data(db_engine)
    with TemporaryDirectory() as temp_dir:
        project_path = os.path.join(temp_dir, "inspections")
        experiment = SingleThreadedExperiment(
            config=sample_config(),
            db_engine=db_engine,
            project_path=project_path,
            profile=True
        ).run()
        assert len(os.listdir(os.path.join(project_path, "profiling_stats"))) == 1


@parametrize_experiment_classes
def test_baselines_with_missing_features(experiment_class):
with testing.postgresql.Postgresql() as postgresql:
7 changes: 7 additions & 0 deletions src/triage/cli.py
@@ -200,6 +200,12 @@ def __init__(self, parser):
action="store_true",
help="only validate the config file not running Experiment",
)
parser.add_argument(
"--profile",
action="store_true",
dest="profile",
help="Record the time spent in various functions using cProfile"
)

        parser.add_argument(
            "--no-materialize-fromobjs",
@@ -223,6 +229,7 @@ def experiment(self):
"replace": self.args.replace,
"materialize_subquery_fromobjs": self.args.materialize_fromobjs,
"matrix_storage_class": self.matrix_storage_map[self.args.matrix_format],
"profile": self.args.profile,
}
if self.args.n_db_processes > 1 or self.args.n_processes > 1:
experiment = MultiCoreExperiment(
28 changes: 26 additions & 2 deletions src/triage/experiments/base.py
@@ -1,5 +1,8 @@
import logging
from abc import ABC, abstractmethod
import cProfile
import marshal
import time

from descriptors import cachedproperty
from timeout import timeout
@@ -68,6 +71,7 @@ class ExperimentBase(ABC):
materialize_subquery_fromobjs (bool, default True) Whether or not to create and index
tables for feature "from objects" that are subqueries. Can speed up performance
when building features for many as-of-dates.
profile (bool, default False) Whether to record a cProfile of the experiment run and write it to the profiling_stats directory in the project path
"""

cleanup_timeout = 60 # seconds
@@ -81,7 +85,8 @@ def __init__(
replace=True,
cleanup=False,
cleanup_timeout=None,
materialize_subquery_fromobjs=True
materialize_subquery_fromobjs=True,
profile=False,
):
self._check_config_version(config)
self.config = config
@@ -117,6 +122,8 @@ def __init__(self):
        self.cleanup_timeout = (
            self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
        )
        self.profile = profile
        logging.info("Generate profiling stats? (profile option): %s", self.profile)

def _check_config_version(self, config):
if "config_version" in config:
@@ -634,9 +641,26 @@ def clean_up_tables(self):
        self.cohort_table_generator.clean_up()
        self.label_generator.clean_up(self.labels_table_name)

    def _run_profile(self):
        cp = cProfile.Profile()
        cp.runcall(self._run)
        store = self.project_storage.get_store(
            ["profiling_stats"],
            f"{int(time.time())}.profile"
        )
        with store.open('wb') as fd:
            cp.create_stats()
            marshal.dump(cp.stats, fd)
logging.info("Profiling stats of this Triage run calculated and written to %s"
"in cProfile format.",
store)

def run(self):
try:
self._run()
if self.profile:
self._run_profile()
else:
self._run()
except Exception:
logging.exception("Run interrupted by uncaught exception")
raise
