diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index ec1e5b948..bfd151b8f 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -8,10 +8,10 @@ jobs: runs-on: ubuntu-18.04 steps: - uses: actions/checkout@master - - name: Set up Python 3.7 + - name: Set up Python 3.8 uses: actions/setup-python@v1 with: - python-version: 3.7 + python-version: 3.8 - name: Install pypa/build run: >- python -m diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 78f9f7772..5d22fb7f8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -22,7 +22,7 @@ jobs: --health-retries 5 strategy: matrix: - python-version: [3.7, 3.8, 3.9] + python-version: ['3.8', '3.9', '3.10'] steps: - uses: actions/checkout@v2 diff --git a/.travis.yml b/.travis.yml index 0d1f0677a..d57db252d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,6 @@ dist: bionic language: python -python: ["3.6", "3.7"] +python: ["3.8", "3.9", "3.10"] addons: postgresql: '11' apt: diff --git a/Dockerfile b/Dockerfile index 4665f1919..aaacb20af 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.7-slim AS development +FROM python:3.8-slim AS development LABEL creator="Center for Data Science and Public Policy (DSaPP)" \ maintainer="Adolfo De Unánue " \ @@ -60,7 +60,7 @@ RUN pip install -e . ENTRYPOINT [ "bash" ] -FROM python:3.7-slim AS master +FROM python:3.8-slim AS master LABEL triage.version="master" diff --git a/README.md b/README.md index 8aac62da0..3d985389f 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Triage is designed to: To install Triage, you need: -- Python 3.7+ +- Python 3.8+ - A PostgreSQL 9.6+ database with your source data (events, geographical data, etc) loaded. - **NOTE**: If your database is PostgreSQL 11+ you will get some diff --git a/docs/sources/dirtyduck/dirty_duckling.md b/docs/sources/dirtyduck/dirty_duckling.md index 7ca9fa5bf..5d1fadaa9 100644 --- a/docs/sources/dirtyduck/dirty_duckling.md +++ b/docs/sources/dirtyduck/dirty_duckling.md @@ -242,7 +242,7 @@ triage experiment experiments/dirty-duckling.yaml ``` That's it! If you see this message in your screen: - 2020-08-20 16:56:56 - SUCCESS Training, testing and evaluatiog models completed + 2020-08-20 16:56:56 - SUCCESS Training, testing and evaluating models completed 2020-08-20 16:56:56 - SUCCESS All matrices that were supposed to be build were built. Awesome! 2020-08-20 16:56:56 - SUCCESS All models that were supposed to be trained were trained. Awesome! 2020-08-20 16:56:56 - SUCCESS Experiment (a336de4800cec8964569d051dc56f85d) ran through completion diff --git a/docs/sources/experiments/running.md b/docs/sources/experiments/running.md index e71e97de9..ecc76a584 100644 --- a/docs/sources/experiments/running.md +++ b/docs/sources/experiments/running.md @@ -4,7 +4,7 @@ To use a Triage experiment, you first need: -- Python 3.7+ +- Python 3.8+ - A PostgreSQL (v9.6+) database with your source data (events, geographical data, etc) loaded. 
- Ample space on an available disk (or S3) to store the needed matrices and models for your experiment - An experiment definition (see [Experiment configuration](experiment-config.md)) diff --git a/docs/sources/index.md b/docs/sources/index.md index 680a150a4..a2c1f04c4 100644 --- a/docs/sources/index.md +++ b/docs/sources/index.md @@ -25,7 +25,7 @@ Triage is designed to: To install Triage, you need: -- Python 3.7+ +- Python 3.8+ - A PostgreSQL 9.6+ database with your source data (events, geographical data, etc) loaded. - **NOTE**: If your database is PostgreSQL 11+ you will get some diff --git a/docs/sources/quickstart.md b/docs/sources/quickstart.md index 63cc27957..84a5157ff 100644 --- a/docs/sources/quickstart.md +++ b/docs/sources/quickstart.md @@ -137,7 +137,7 @@ defaults for others. The primary parameters to specify (for now) are: - `entity_id`: each `entity_id` affected by an event within the amount of time specified by `label_timespan` after a given `as_of_date` - `outcome`: a binary variable representing the events that happened to each entity, within the period specified by that `as_of_date` and `label_timespan` - The query is parameterized over `as_of_date`, and `label_timespan`. These parameters are passed to your query as named keywords using the Python's [`str.format()`](https://docs.python.org/3.7/library/stdtypes.html#str.format) method. You can use them in your query by surrounding their keywords with curly braces (as in the example below). + The query is parameterized over `as_of_date`, and `label_timespan`. These parameters are passed to your query as named keywords using the Python's [`str.format()`](https://docs.python.org/3.8/library/stdtypes.html#str.format) method. You can use them in your query by surrounding their keywords with curly braces (as in the example below). See our [guide to Labels](https://dssg.github.io/triage/experiments/cohort-labels/) for a more in-depth discussion of this topic. 
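For context on the quickstart change above: the label query is rendered with Python's `str.format()`, so `{as_of_date}` and `{label_timespan}` are plain curly-brace placeholders in the SQL text. A minimal sketch of that pattern follows; the `events` table and its columns are hypothetical placeholders, not taken from the Triage docs.

```python
# Minimal sketch of a str.format()-parameterized label query.
# The "events" table and its columns are hypothetical placeholders.
label_query = """
    select
        entity_id,
        max(outcome)::integer as outcome
    from events
    where event_date > '{as_of_date}'::timestamp
      and event_date <= '{as_of_date}'::timestamp + interval '{label_timespan}'
    group by entity_id
"""

# Triage fills the named keywords the same way, leaving the curly braces
# in the config file untouched until the query is rendered.
rendered = label_query.format(as_of_date="2016-01-01", label_timespan="6 months")
print(rendered)
```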
diff --git a/requirement/dev.txt b/requirement/dev.txt index b117eb353..45f5418a4 100644 --- a/requirement/dev.txt +++ b/requirement/dev.txt @@ -1,7 +1,7 @@ -r include/build.txt bumpversion==0.6.0 mkdocs==1.3.0 -pymdown-extensions==7.1 -mkdocs-material==7.0.6 -mkdocstrings==0.12.0 -black==19.10b0 +pymdown-extensions==9.4 +mkdocs-material==8.2.12 +mkdocstrings==0.18.1 +black==22.3.0 diff --git a/requirement/include/build.txt b/requirement/include/build.txt index b82d21b37..e75560c7c 100644 --- a/requirement/include/build.txt +++ b/requirement/include/build.txt @@ -1 +1 @@ -wheel==0.34.2 +wheel==0.37.1 diff --git a/requirement/include/lint.txt b/requirement/include/lint.txt index d0acd85e8..ec6112141 100644 --- a/requirement/include/lint.txt +++ b/requirement/include/lint.txt @@ -1 +1 @@ -flake8==3.8.3 +flake8==4.0.1 diff --git a/requirement/include/test-management.txt b/requirement/include/test-management.txt index 9ed8460c3..b4b6c778c 100644 --- a/requirement/include/test-management.txt +++ b/requirement/include/test-management.txt @@ -1,3 +1,3 @@ -codecov==2.1.7 +codecov==2.1.12 coverage>=4.4 -tox==3.16.1 +tox==3.25.0 diff --git a/requirement/main.txt b/requirement/main.txt index b074802e2..d4044d746 100644 --- a/requirement/main.txt +++ b/requirement/main.txt @@ -1,31 +1,31 @@ -alembic==1.4.2 -SQLAlchemy==1.3.18 -PyYAML==5.4.1 -psycopg2-binary==2.8.5 -python-dateutil==2.8.1 -boto3==1.14.45 -click==7.1.2 -inflection==0.5.0 -numpy==1.21.1 +alembic==1.7.7 +SQLAlchemy==1.3.18 # pyup: ignore +PyYAML==6.0 +psycopg2-binary==2.9.3 +python-dateutil==2.8.2 +boto3==1.22.4 +click==8.1.3 +inflection==0.5.1 +numpy==1.22.3 sqlalchemy-postgres-copy==0.5.0 retrying==1.3.3 Dickens==1.0.1 signalled-timeout==1.0.0 -s3fs==0.4.2 -wrapt==1.13.3 +wrapt==1.14.0 argcmdr==0.7.0 sqlparse==0.4.2 -pebble==4.5.3 +pebble==4.6.3 adjustText==0.7.3 -graphviz==0.14 -requests==2.24.0 -coloredlogs==14.0 +graphviz==0.20 +requests==2.27.1 +coloredlogs==15.0.1 verboselogs==1.7 -scipy==1.5.0 -scikit-learn==0.23.1 -matplotlib==3.3.4 -pandas==1.0.5 -seaborn==0.10.1 +s3fs==0.4.2 # pyup: ignore +scipy==1.8.0 +scikit-learn==1.0.2 +matplotlib==3.5.1 +pandas==1.3.5 # pyup: ignore +seaborn==0.11.2 ohio==0.5.0 diff --git a/requirement/test.txt b/requirement/test.txt index 4e91b1e2f..0ffe10b34 100644 --- a/requirement/test.txt +++ b/requirement/test.txt @@ -1,11 +1,11 @@ -r include/lint.txt -r include/test-management.txt parsedatetime==2.6 -csvkit==1.0.5 -factory_boy==2.12.0 +csvkit==1.0.7 +factory_boy==3.2.1 testing.postgresql==1.3.0 -pytest==5.4.3 #<4.0.0 # pyup: ignore -pytest-cov==2.10.0 -moto==1.3.14 -fakeredis==1.4.1 -hypothesis==5.19.0 +pytest==6.2.5 #<4.0.0 # pyup: ignore +pytest-cov==3.0.0 +moto==3.1.7 +fakeredis==1.7.1 +hypothesis==6.46.1 diff --git a/setup.py b/setup.py index 9e768aa86..7a649114f 100755 --- a/setup.py +++ b/setup.py @@ -7,15 +7,15 @@ ROOT_PATH = Path(__file__).parent -LICENSE_PATH = ROOT_PATH / 'LICENSE' +LICENSE_PATH = ROOT_PATH / "LICENSE" -README_PATH = ROOT_PATH / 'README.md' +README_PATH = ROOT_PATH / "README.md" -REQUIREMENTS_PATH = ROOT_PATH / 'requirement' / 'main.txt' +REQUIREMENTS_PATH = ROOT_PATH / "requirement" / "main.txt" -REQUIREMENTS_TEST_PATH = ROOT_PATH / 'requirement' / 'test.txt' +REQUIREMENTS_TEST_PATH = ROOT_PATH / "requirement" / "test.txt" -REQUIREMENTS_RQ_PATH = ROOT_PATH / 'requirement' / 'extras-rq.txt' +REQUIREMENTS_RQ_PATH = ROOT_PATH / "requirement" / "extras-rq.txt" def stream_requirements(fd): @@ -25,8 +25,8 @@ def stream_requirements(fd): """ for line in fd: - cleaned = 
re.sub(r'#.*$', '', line).strip() - if cleaned and not cleaned.startswith('-r'): + cleaned = re.sub(r"#.*$", "", line).strip() + if cleaned and not cleaned.startswith("-r"): yield cleaned @@ -43,40 +43,41 @@ def stream_requirements(fd): setup( - name='triage', - version='5.1.1', + name="triage", + version="5.1.1", description="Risk modeling and prediction", long_description=README_PATH.read_text(), long_description_content_type="text/markdown", author="Center for Data Science and Public Policy", - author_email='datascifellows@gmail.com', + author_email="datascifellows@gmail.com", url="https://dssg.github.io/triage/", project_urls={ "Documentation": "https://dssg.github.io/triage/", "Source Code": "https://github.com/dssg/triage", - "Tutorial": "https://dssg.github.io/triage/dirtyduck/" + "Tutorial": "https://dssg.github.io/triage/dirtyduck/", }, - packages=find_packages('src', exclude=['tests', 'tests.*']), - package_dir={'': 'src'}, + packages=find_packages("src", exclude=["tests", "tests.*"]), + package_dir={"": "src"}, include_package_data=True, install_requires=REQUIREMENTS, entry_points={ - 'console_scripts': ['triage = triage.cli:execute'], + "console_scripts": ["triage = triage.cli:execute"], }, - extras_require={'rq': RQ_REQUIREMENTS}, + extras_require={"rq": RQ_REQUIREMENTS}, license="MIT License", zip_safe=False, - keywords='triage', + keywords="triage", classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", ], - python_requires='>=3.6', - test_suite='tests', - tests_require=REQUIREMENTS_TEST + python_requires=">=3.8", + test_suite="tests", + tests_require=REQUIREMENTS_TEST, ) diff --git a/src/tests/architect_tests/test_builders.py b/src/tests/architect_tests/test_builders.py index 4818f3a69..005f27c54 100644 --- a/src/tests/architect_tests/test_builders.py +++ b/src/tests/architect_tests/test_builders.py @@ -3,6 +3,8 @@ import pandas as pd import testing.postgresql +from unittest.mock import Mock + from triage import create_engine from contextlib import contextmanager @@ -421,11 +423,11 @@ def test_load_features_data(): cols = ["entity_id", "as_of_date"] + features[i] temp_df = pd.DataFrame(table, columns=cols) temp_df["as_of_date"] = convert_string_column_to_date(temp_df["as_of_date"]) - features_dfs.append( - ids_dates.merge( - right=temp_df, how="left", on=["entity_id", "as_of_date"] - ).set_index(["entity_id", "as_of_date"]) + merged_df = ids_dates.merge( + right=temp_df, how="left", on=["entity_id", "as_of_date"] ) + merged_df["as_of_date"] = pd.to_datetime(merged_df["as_of_date"]) + features_dfs.append(merged_df.set_index(["entity_id", "as_of_date"])) # create an engine and generate a table with fake feature data with testing.postgresql.Postgresql() as postgresql: diff --git a/src/tests/catwalk_tests/test_evaluation.py b/src/tests/catwalk_tests/test_evaluation.py index ea2ee8a08..3f193560d 100644 --- a/src/tests/catwalk_tests/test_evaluation.py +++ b/src/tests/catwalk_tests/test_evaluation.py @@ -91,8 +91,7 
@@ def populate_subset_data(db_engine, subset, entity_ids, as_of_date=TRAIN_END_TIM from unfiltered_row {query_where_clause} """ - db_engine.execute( - text(insert_query).execution_options(autocommit=True)) + db_engine.execute(text(insert_query).execution_options(autocommit=True)) def test_all_same_labels(db_engine_with_results_schema): @@ -128,13 +127,14 @@ def test_all_same_labels(db_engine_with_results_schema): "entity_id": list(range(num_entities)), "as_of_date": [TRAIN_END_TIME] * num_entities, } - ).set_index(["entity_id", "as_of_date"]).label_value, + ) + .set_index(["entity_id", "as_of_date"]) + .label_value, init_as_of_dates=[TRAIN_END_TIME], ) model_evaluator.evaluate( - trained_model.predict_proba( - labels)[:, 1], fake_matrix_store, model_id + trained_model.predict_proba(labels)[:, 1], fake_matrix_store, model_id ) for metric, best, worst, stochastic in db_engine_with_results_schema.execute( @@ -143,10 +143,7 @@ def test_all_same_labels(db_engine_with_results_schema): where model_id = %s and evaluation_start_time = %s order by 1""", - ( - model_id, - fake_matrix_store.as_of_dates[0] - ), + (model_id, fake_matrix_store.as_of_dates[0]), ): if metric == "accuracy": assert best is not None @@ -174,7 +171,9 @@ def test_subset_labels_and_predictions(db_engine_with_results_schema): "entity_id": list(range(num_entities)), "as_of_date": [TRAIN_END_TIME] * num_entities, } - ).set_index(["entity_id", "as_of_date"]).label_value, + ) + .set_index(["entity_id", "as_of_date"]) + .label_value, init_as_of_dates=[TRAIN_END_TIME], ) @@ -186,9 +185,14 @@ def test_subset_labels_and_predictions(db_engine_with_results_schema): elif subset["name"] == "empty": expected_result = 0 - populate_subset_data(db_engine_with_results_schema, - subset, list(range(num_entities))) - subset_labels, subset_predictions, subset_protected_df = subset_labels_and_predictions( + populate_subset_data( + db_engine_with_results_schema, subset, list(range(num_entities)) + ) + ( + subset_labels, + subset_predictions, + subset_protected_df, + ) = subset_labels_and_predictions( subset_df=query_subset_table( db_engine_with_results_schema, fake_matrix_store.as_of_dates, @@ -229,8 +233,7 @@ def test_evaluating_early_warning(db_engine_with_results_schema): "average precision score", ] }, - {"metrics": ["fbeta@"], - "parameters": [{"beta": 0.75}, {"beta": 1.25}]}, + {"metrics": ["fbeta@"], "parameters": [{"beta": 0.75}, {"beta": 1.25}]}, ] training_metric_groups = [{"metrics": ["accuracy", "roc_auc"]}] @@ -256,7 +259,9 @@ def test_evaluating_early_warning(db_engine_with_results_schema): "entity_id": list(range(num_entities)), "as_of_date": [TRAIN_END_TIME] * num_entities, } - ).set_index(["entity_id", "as_of_date"]).label_value, + ) + .set_index(["entity_id", "as_of_date"]) + .label_value, init_as_of_dates=[TRAIN_END_TIME], ) fake_train_matrix_store = MockMatrixStore( @@ -270,7 +275,9 @@ def test_evaluating_early_warning(db_engine_with_results_schema): "entity_id": list(range(num_entities)), "as_of_date": [TRAIN_END_TIME] * num_entities, } - ).set_index(["entity_id", "as_of_date"]).label_value, + ) + .set_index(["entity_id", "as_of_date"]) + .label_value, init_as_of_dates=[TRAIN_END_TIME], ) @@ -282,14 +289,15 @@ def test_evaluating_early_warning(db_engine_with_results_schema): # ensure that the matrix uuid is present matrix_uuids = [ row[0] - for row in db_engine_with_results_schema.execute("select matrix_uuid from test_results.evaluations") + for row in db_engine_with_results_schema.execute( + "select matrix_uuid from 
test_results.evaluations" + ) ] assert all(matrix_uuid == "efgh" for matrix_uuid in matrix_uuids) # Evaluate the training metrics and test model_evaluator.evaluate( - trained_model.predict_proba( - labels)[:, 1], fake_train_matrix_store, model_id + trained_model.predict_proba(labels)[:, 1], fake_train_matrix_store, model_id ) records = [ row[0] @@ -309,19 +317,18 @@ def test_evaluating_early_warning(db_engine_with_results_schema): if subset is None: where_hash = "" else: - populate_subset_data(db_engine_with_results_schema, - subset, list(range(num_entities))) + populate_subset_data( + db_engine_with_results_schema, subset, list(range(num_entities)) + ) SubsetFactory(subset_hash=filename_friendly_hash(subset)) session.commit() where_hash = f"and subset_hash = '{filename_friendly_hash(subset)}'" - # Evaluate the testing metrics and test for all of them. model_evaluator.evaluate( trained_model.predict_proba(labels)[:, 1], fake_test_matrix_store, model_id, subset=subset, - ) records = [ @@ -335,10 +342,7 @@ def test_evaluating_early_warning(db_engine_with_results_schema): {where_hash} order by 1 """, - ( - model_id, - fake_test_matrix_store.as_of_dates[0] - ), + (model_id, fake_test_matrix_store.as_of_dates[0]), ) ] assert records == [ @@ -392,10 +396,7 @@ def test_evaluating_early_warning(db_engine_with_results_schema): evaluation_start_time = %s {where_hash} order by 1""", - ( - model_id, - fake_train_matrix_store.as_of_dates[0] - ), + (model_id, fake_train_matrix_store.as_of_dates[0]), ) ] assert records == ["accuracy", "roc_auc"] @@ -403,7 +404,9 @@ def test_evaluating_early_warning(db_engine_with_results_schema): # ensure that the matrix uuid is present matrix_uuids = [ row[0] - for row in db_engine_with_results_schema.execute("select matrix_uuid from train_results.evaluations") + for row in db_engine_with_results_schema.execute( + "select matrix_uuid from train_results.evaluations" + ) ] assert all(matrix_uuid == "1234" for matrix_uuid in matrix_uuids) @@ -432,12 +435,8 @@ def test_model_scoring_inspections(db_engine_with_results_schema): testing_labels = np.array([1, 0, np.nan, 1, 0]) testing_prediction_probas = np.array([0.56, 0.4, 0.55, 0.5, 0.3]) - training_labels = np.array( - [0, 0, 1, 1, 1, 0, 1, 1] - ) - training_prediction_probas = np.array( - [0.6, 0.4, 0.55, 0.70, 0.3, 0.2, 0.8, 0.6] - ) + training_labels = np.array([0, 0, 1, 1, 1, 0, 1, 1]) + training_prediction_probas = np.array([0.6, 0.4, 0.55, 0.70, 0.3, 0.2, 0.8, 0.6]) fake_train_matrix_store = MockMatrixStore( "train", "efgh", 5, db_engine_with_results_schema, training_labels @@ -556,14 +555,22 @@ def test_ModelEvaluator_needs_evaluation_no_bias_audit(db_engine_with_results_sc # make a test matrix to pass in metadata_overrides = { - 'as_of_date_frequency': as_of_date_frequency, - 'as_of_times': [eval_time], + "as_of_date_frequency": as_of_date_frequency, + "as_of_times": [eval_time], } test_matrix_store = MockMatrixStore( - "test", "1234", 5, db_engine_with_results_schema, metadata_overrides=metadata_overrides + "test", + "1234", + 5, + db_engine_with_results_schema, + metadata_overrides=metadata_overrides, ) train_matrix_store = MockMatrixStore( - "train", "2345", 5, db_engine_with_results_schema, metadata_overrides=metadata_overrides + "train", + "2345", + 5, + db_engine_with_results_schema, + metadata_overrides=metadata_overrides, ) # the evaluated model has test evaluations for precision, but not recall, @@ -575,10 +582,12 @@ def test_ModelEvaluator_needs_evaluation_no_bias_audit(db_engine_with_results_sc 
subset_hash = filename_friendly_hash(subset) assert ModelEvaluator( - testing_metric_groups=[{ - "metrics": ["precision@", "recall@"], - "thresholds": {"top_n": [100]}, - }], + testing_metric_groups=[ + { + "metrics": ["precision@", "recall@"], + "thresholds": {"top_n": [100]}, + } + ], training_metric_groups=[], db_engine=db_engine_with_results_schema, ).needs_evaluations( @@ -596,10 +605,12 @@ def test_ModelEvaluator_needs_evaluation_no_bias_audit(db_engine_with_results_sc subset_hash = filename_friendly_hash(subset) assert not ModelEvaluator( - testing_metric_groups=[{ - "metrics": ["precision@"], - "thresholds": {"top_n": [100]}, - }], + testing_metric_groups=[ + { + "metrics": ["precision@"], + "thresholds": {"top_n": [100]}, + } + ], training_metric_groups=[], db_engine=db_engine_with_results_schema, ).needs_evaluations( @@ -617,10 +628,12 @@ def test_ModelEvaluator_needs_evaluation_no_bias_audit(db_engine_with_results_sc subset_hash = filename_friendly_hash(subset) assert ModelEvaluator( - testing_metric_groups=[{ - "metrics": ["precision@"], - "thresholds": {"top_n": [100]}, - }], + testing_metric_groups=[ + { + "metrics": ["precision@"], + "thresholds": {"top_n": [100]}, + } + ], training_metric_groups=[], db_engine=db_engine_with_results_schema, ).needs_evaluations( @@ -638,14 +651,18 @@ def test_ModelEvaluator_needs_evaluation_no_bias_audit(db_engine_with_results_sc subset_hash = filename_friendly_hash(subset) assert ModelEvaluator( - testing_metric_groups=[{ - "metrics": ["precision@"], - "thresholds": {"top_n": [100]}, - }], - training_metric_groups=[{ - "metrics": ["precision@"], - "thresholds": {"top_n": [100]}, - }], + testing_metric_groups=[ + { + "metrics": ["precision@"], + "thresholds": {"top_n": [100]}, + } + ], + training_metric_groups=[ + { + "metrics": ["precision@"], + "thresholds": {"top_n": [100]}, + } + ], db_engine=db_engine_with_results_schema, ).needs_evaluations( matrix_store=train_matrix_store, @@ -668,9 +685,7 @@ def test_ModelEvaluator_needs_evaluation_with_bias_audit(db_engine_with_results_ }, ], training_metric_groups=[], - bias_config={ - 'thresholds': {'top_n': [2]} - }, + bias_config={"thresholds": {"top_n": [2]}}, db_engine=db_engine_with_results_schema, ) model_with_evaluations = ModelFactory() @@ -691,11 +706,15 @@ def test_ModelEvaluator_needs_evaluation_with_bias_audit(db_engine_with_results_ # make a test matrix to pass in metadata_overrides = { - 'as_of_date_frequency': as_of_date_frequency, - 'as_of_times': [eval_time], + "as_of_date_frequency": as_of_date_frequency, + "as_of_times": [eval_time], } test_matrix_store = MockMatrixStore( - "test", "1234", 5, db_engine_with_results_schema, metadata_overrides=metadata_overrides + "test", + "1234", + 5, + db_engine_with_results_schema, + metadata_overrides=metadata_overrides, ) assert model_evaluator.needs_evaluations( matrix_store=test_matrix_store, @@ -715,9 +734,7 @@ def test_evaluation_with_protected_df(db_engine_with_results_schema): }, ], training_metric_groups=[], - bias_config={ - 'thresholds': {'top_n': [2]} - }, + bias_config={"thresholds": {"top_n": [2]}}, db_engine=db_engine_with_results_schema, ) testing_labels = np.array([1, 0]) @@ -732,10 +749,12 @@ def test_evaluation_with_protected_df(db_engine_with_results_schema): train_end_time=TRAIN_END_TIME, ) - protected_df = pd.DataFrame({ - "entity_id": fake_test_matrix_store.design_matrix.index.levels[0].tolist(), - "protectedattribute1": "value1" - }) + protected_df = pd.DataFrame( + { + "entity_id": 
fake_test_matrix_store.design_matrix.index.levels[0].tolist(), + "protectedattribute1": "value1", + } + ) model_evaluator.evaluate( testing_prediction_probas, fake_test_matrix_store, model_id, protected_df @@ -746,10 +765,10 @@ def test_evaluation_with_protected_df(db_engine_with_results_schema): order by 1""", (model_id, fake_test_matrix_store.as_of_dates[0]), ): - assert record['model_id'] == model_id - assert record['parameter'] == '2_abs' - assert record['attribute_name'] == 'protectedattribute1' - assert record['attribute_value'] == 'value1' + assert record["model_id"] == model_id + assert record["parameter"] == "2_abs" + assert record["attribute_name"] == "protectedattribute1" + assert record["attribute_value"] == "value1" def test_evaluation_sorting_with_protected_df(db_engine_with_results_schema): @@ -763,34 +782,37 @@ def test_evaluation_sorting_with_protected_df(db_engine_with_results_schema): }, ], training_metric_groups=[], - bias_config={ - 'thresholds': {'top_n': [2]} - }, + bias_config={"thresholds": {"top_n": [2]}}, db_engine=db_engine_with_results_schema, ) testing_labels = np.array([1, 1, 1, 0, 1]) testing_prediction_probas = np.array([0.56, 0.55, 0.92, 0.85, 0.24]) fake_test_matrix_store = MockMatrixStore( - "test", "1234", 5, db_engine_with_results_schema, - metadata_overrides={'as_of_times': [TRAIN_END_TIME]}, + "test", + "1234", + 5, + db_engine_with_results_schema, + metadata_overrides={"as_of_times": [TRAIN_END_TIME]}, matrix=pd.DataFrame.from_dict( - { - "entity_id": [1, 2, 3, 4, 5], - "as_of_date": [pd.Timestamp(2016, 1, 1)]*5, - "feature_one": [3, 4, 3, 4, 3], - "feature_two": [5, 6, 5, 6, 5], - "label": testing_labels, - } - ).set_index(MatrixStore.indices), + { + "entity_id": [1, 2, 3, 4, 5], + "as_of_date": [pd.Timestamp(2016, 1, 1)] * 5, + "feature_one": [3, 4, 3, 4, 3], + "feature_two": [5, 6, 5, 6, 5], + "label": testing_labels, + } + ).set_index(MatrixStore.indices), init_labels=pd.DataFrame( { "label_value": testing_labels, "entity_id": [1, 2, 3, 4, 5], - "as_of_date": [pd.Timestamp(2016, 1, 1)]*5, + "as_of_date": [pd.Timestamp(2016, 1, 1)] * 5, } - ).set_index(["entity_id", "as_of_date"]).label_value, - init_as_of_dates=[TRAIN_END_TIME] + ) + .set_index(["entity_id", "as_of_date"]) + .label_value, + init_as_of_dates=[TRAIN_END_TIME], ) trained_model, model_id = fake_trained_model( @@ -798,16 +820,19 @@ def test_evaluation_sorting_with_protected_df(db_engine_with_results_schema): train_end_time=TRAIN_END_TIME, ) - protected_df = pd.DataFrame({ - # "entity_id": fake_test_matrix_store.design_matrix.index.levels[0].tolist(), - # "as_of_date": fake_test_matrix_store.design_matrix.index.levels[1].tolist(), - "protectedattribute1": ["low", "low", "low", "high", "high"] - }, index=fake_test_matrix_store.design_matrix.index) + protected_df = pd.DataFrame( + { + # "entity_id": fake_test_matrix_store.design_matrix.index.levels[0].tolist(), + # "as_of_date": fake_test_matrix_store.design_matrix.index.levels[1].tolist(), + "protectedattribute1": ["low", "low", "low", "high", "high"] + }, + index=fake_test_matrix_store.design_matrix.index, + ) # should be low has 3 records, all 1's; high has 2 records, one 1 expected = { "low": {"group_size": 3, "group_label_neg": 0, "group_label_pos": 3}, - "high": {"group_size": 2, "group_label_neg": 1, "group_label_pos": 1} + "high": {"group_size": 2, "group_label_neg": 1, "group_label_pos": 1}, } model_evaluator.evaluate( @@ -820,30 +845,24 @@ def test_evaluation_sorting_with_protected_df(db_engine_with_results_schema): order by 
1""", (model_id, fake_test_matrix_store.as_of_dates[0]), ): - assert record['model_id'] == model_id - assert record['parameter'] == '2_abs' - assert record['attribute_name'] == 'protectedattribute1' - for col, value in expected[record['attribute_value']].items(): + assert record["model_id"] == model_id + assert record["parameter"] == "2_abs" + assert record["attribute_name"] == "protectedattribute1" + for col, value in expected[record["attribute_value"]].items(): assert record[col] == value - def test_generate_binary_at_x(): - input_array = np.array( - [0.9, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.6]) + input_array = np.array([0.9, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.6]) # bug can arise when the same value spans both sides of threshold assert_array_equal( generate_binary_at_x(input_array, 50, "percentile"), - np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0]) + np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0]), ) assert_array_equal( - generate_binary_at_x(input_array, 2), - np.array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0]) + generate_binary_at_x(input_array, 2), np.array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0]) ) - assert_array_equal( - generate_binary_at_x(np.array([]), 2), - np.array([]) - ) + assert_array_equal(generate_binary_at_x(np.array([]), 2), np.array([])) diff --git a/src/tests/catwalk_tests/test_storage.py b/src/tests/catwalk_tests/test_storage.py index 53c0a589d..6dbb019fb 100644 --- a/src/tests/catwalk_tests/test_storage.py +++ b/src/tests/catwalk_tests/test_storage.py @@ -25,31 +25,38 @@ class SomeClass: - def __init__(self, val): self.val = val +@mock_s3 def test_S3Store(): - with mock_s3(): - client = boto3.client("s3") - client.create_bucket(Bucket="test_bucket", ACL="public-read-write") - store = S3Store(f"s3://test_bucket/a_path") - assert not store.exists() - store.write("val".encode("utf-8")) - assert store.exists() - newVal = store.load() - assert newVal.decode("utf-8") == "val" - store.delete() - assert not store.exists() + client = boto3.client("s3") + client.create_bucket( + Bucket="test_bucket", + ACL="public-read-write", + CreateBucketConfiguration={"LocationConstraint": "us-east-2"}, + ) + store = S3Store(f"s3://test_bucket/a_path") + assert not store.exists() + store.write("val".encode("utf-8")) + assert store.exists() + newVal = store.load() + assert newVal.decode("utf-8") == "val" + store.delete() + assert not store.exists() @mock_s3 def test_S3Store_large(): - client = boto3.client('s3') - client.create_bucket(Bucket='test_bucket', ACL='public-read-write') + client = boto3.client("s3") + client.create_bucket( + Bucket="test_bucket", + ACL="public-read-write", + CreateBucketConfiguration={"LocationConstraint": "us-east-2"}, + ) - store = S3Store('s3://test_bucket/a_path') + store = S3Store("s3://test_bucket/a_path") assert not store.exists() # NOTE: The issue under test (currently) arises when too large a "part" @@ -77,24 +84,24 @@ def test_S3Store_large(): # NOTE: to the point of self-invalidation. (But, this should do the # NOTE: trick; and, we can always increase the payload size here, or # NOTE: otherwise tweak configuration, as necessary.) 
- one_mb = 2 ** 20 + one_mb = 2**20 payload = b"0" * (10 * one_mb) # 10MiB text of all zeros - with CallSpy('botocore.client.BaseClient._make_api_call') as spy: + with CallSpy("botocore.client.BaseClient._make_api_call") as spy: store.write(payload) call_args = [call[0] for call in spy.calls] call_methods = [args[1] for args in call_args] assert call_methods == [ - 'CreateMultipartUpload', - 'UploadPart', - 'UploadPart', - 'CompleteMultipartUpload', + "CreateMultipartUpload", + "UploadPart", + "UploadPart", + "CompleteMultipartUpload", ] upload_args = call_args[1] - upload_body = upload_args[2]['Body'] + upload_body = upload_args[2]["Body"] # NOTE: Why is this a BufferIO rather than the underlying buffer?! # NOTE: (Would have expected the result of BufferIO.read() -- str.) @@ -123,22 +130,22 @@ def test_FSStore(): def test_ModelStorageEngine_nocaching(project_storage): mse = ModelStorageEngine(project_storage) - mse.write('testobject', 'myhash') - assert mse.exists('myhash') - assert mse.load('myhash') == 'testobject' - assert 'myhash' not in mse.cache + mse.write("testobject", "myhash") + assert mse.exists("myhash") + assert mse.load("myhash") == "testobject" + assert "myhash" not in mse.cache def test_ModelStorageEngine_caching(project_storage): mse = ModelStorageEngine(project_storage) with mse.cache_models(): - mse.write('testobject', 'myhash') + mse.write("testobject", "myhash") with mock.patch.object(mse, "_get_store") as get_store_mock: - assert mse.load('myhash') == 'testobject' + assert mse.load("myhash") == "testobject" assert not get_store_mock.called - assert 'myhash' in mse.cache + assert "myhash" in mse.cache # when cache_models goes out of scope the cache should be empty - assert 'myhash' not in mse.cache + assert "myhash" not in mse.cache DATA_DICT = OrderedDict( @@ -240,7 +247,7 @@ def test_MatrixStore_save(): "as_of_date": [pd.Timestamp(2017, 1, 1), pd.Timestamp(2017, 1, 1)], "feature_one": [0.5, 0.6], "feature_two": [0.5, 0.6], - "label": [1, 0] + "label": [1, 0], } df = pd.DataFrame.from_dict(data) labels = df.pop("label") @@ -250,10 +257,7 @@ def test_MatrixStore_save(): matrix_store.matrix_label_tuple = df, labels matrix_store.save() - assert_frame_equal( - matrix_store.design_matrix, - df - ) + assert_frame_equal(matrix_store.design_matrix, df) def test_MatrixStore_caching(): @@ -276,7 +280,7 @@ def test_as_of_dates(project_storage): pd.Timestamp(2017, 1, 1), pd.Timestamp(2017, 1, 1), ], - "label": [1, 0, 1, 0] + "label": [1, 0, 1, 0], } df = pd.DataFrame.from_dict(data) matrix_store = CSVMatrixStore( @@ -284,25 +288,32 @@ def test_as_of_dates(project_storage): [], "test", matrix=df, - metadata={"indices": ["entity_id", "as_of_date"], "label_name": "label"} + metadata={"indices": ["entity_id", "as_of_date"], "label_name": "label"}, ) - assert matrix_store.as_of_dates == [datetime.date(2016, 1, 1), datetime.date(2017, 1, 1)] + assert matrix_store.as_of_dates == [ + datetime.date(2016, 1, 1), + datetime.date(2017, 1, 1), + ] +@mock_s3 def test_s3_save(): - with mock_s3(): - client = boto3.client("s3") - client.create_bucket(Bucket="fake-matrix-bucket", ACL="public-read-write") - for example in matrix_stores(): - if not isinstance(example, CSVMatrixStore): - continue - project_storage = ProjectStorage("s3://fake-matrix-bucket") - - tosave = CSVMatrixStore(project_storage, [], "test") - tosave.metadata = example.metadata - tosave.matrix_label_tuple = example.matrix_label_tuple - tosave.save() - - tocheck = CSVMatrixStore(project_storage, [], "test") - assert 
tocheck.metadata == example.metadata - assert tocheck.design_matrix.to_dict() == example.design_matrix.to_dict() + client = boto3.client("s3") + client.create_bucket( + Bucket="fake-matrix-bucket", + ACL="public-read-write", + CreateBucketConfiguration={"LocationConstraint": "us-east-2"}, + ) + for example in matrix_stores(): + if not isinstance(example, CSVMatrixStore): + continue + project_storage = ProjectStorage("s3://fake-matrix-bucket") + + tosave = CSVMatrixStore(project_storage, [], "test") + tosave.metadata = example.metadata + tosave.matrix_label_tuple = example.matrix_label_tuple + tosave.save() + + tocheck = CSVMatrixStore(project_storage, [], "test") + assert tocheck.metadata == example.metadata + assert tocheck.design_matrix.to_dict() == example.design_matrix.to_dict() diff --git a/src/triage/component/catwalk/evaluation.py b/src/triage/component/catwalk/evaluation.py index b445393a1..f6f3540bb 100644 --- a/src/triage/component/catwalk/evaluation.py +++ b/src/triage/component/catwalk/evaluation.py @@ -1,6 +1,7 @@ import functools import itertools import verboselogs, logging + logger = verboselogs.VerboseLogger(__name__) import math @@ -22,7 +23,7 @@ db_retry, sort_predictions_and_labels, get_subset_table_name, - filename_friendly_hash + filename_friendly_hash, ) from triage.util.db import scoped_session from triage.util.random import generate_python_random_seed @@ -32,7 +33,6 @@ SORT_TRIALS = 30 - def subset_labels_and_predictions( subset_df, labels, @@ -61,7 +61,11 @@ def subset_labels_and_predictions( # The subset isn't specific to the cohort, so inner join to the labels/predictions labels_subset = labels.align(subset_df, join="inner")[0] predictions_subset = indexed_predictions.align(subset_df, join="inner")[0].values - protected_df_subset = protected_df if protected_df.empty else protected_df.align(subset_df, join="inner")[0] + protected_df_subset = ( + protected_df + if protected_df.empty + else protected_df.align(subset_df, join="inner")[0] + ) logger.spam( f"{len(labels_subset)} entities in subset out of {len(labels)} in matrix.", ) @@ -82,7 +86,9 @@ def query_subset_table(db_engine, as_of_dates, subset_table_name): active in the subset """ as_of_dates_sql = "[{}]".format( - ", ".join("'{}'".format(date.strftime("%Y-%m-%d %H:%M:%S.%f")) for date in as_of_dates) + ", ".join( + "'{}'".format(date.strftime("%Y-%m-%d %H:%M:%S.%f")) for date in as_of_dates + ) ) query_string = f""" with dates as ( @@ -92,8 +98,12 @@ def query_subset_table(db_engine, as_of_dates, subset_table_name): from {subset_table_name} join dates using(as_of_date) """ - df = pd.DataFrame.pg_copy_from(query_string, connectable=db_engine, parse_dates=["as_of_date"], - index_col=MatrixStore.indices) + df = pd.DataFrame.pg_copy_from( + query_string, + connectable=db_engine, + parse_dates=["as_of_date"], + index_col=MatrixStore.indices, + ) return df @@ -116,7 +126,9 @@ def generate_binary_at_x(test_predictions, x_value, unit="top_n"): else: cutoff_index = int(x_value) num_ones = cutoff_index if cutoff_index <= len_predictions else len_predictions - num_zeroes = len_predictions - cutoff_index if cutoff_index <= len_predictions else 0 + num_zeroes = ( + len_predictions - cutoff_index if cutoff_index <= len_predictions else 0 + ) test_predictions_binary = np.concatenate( (np.ones(num_ones, np.int8), np.zeros(num_zeroes, np.int8)) ) @@ -125,6 +137,7 @@ def generate_binary_at_x(test_predictions, x_value, unit="top_n"): class MetricDefinition(typing.NamedTuple): """A single metric, bound to a particular threshold 
and parameter combination""" + metric: str threshold_unit: str threshold_value: int @@ -137,6 +150,7 @@ class MetricEvaluationResult(typing.NamedTuple): The 'value' could represent the worst, best, or a random version of tiebreaking. """ + metric: str parameter: str value: float @@ -224,8 +238,7 @@ def _validate_metrics(self, custom_metrics): for name, met in custom_metrics.items(): if not hasattr(met, "greater_is_better"): raise ValueError( - f"Custom metric {name} missing greater_is_better " - f"attribute" + f"Custom metric {name} missing greater_is_better " f"attribute" ) elif met.greater_is_better not in (True, False): raise ValueError( @@ -317,7 +330,7 @@ def _flatten_metric_threshold( parameter_string=parameter_string, parameter_combination=parameter_combination, threshold_unit=threshold_unit, - threshold_value=threshold_value + threshold_value=threshold_value, ) metric_definitions.append(result) return metric_definitions @@ -390,7 +403,7 @@ def metric_definitions_from_matrix_type(self, matrix_type): else: return self._flatten_metric_config_groups(self.training_metric_groups) - def needs_evaluations(self, matrix_store, model_id, subset_hash=''): + def needs_evaluations(self, matrix_store, model_id, subset_hash=""): """Returns whether or not all the configured metrics are present in the database for the given matrix and model. Args: @@ -412,24 +425,31 @@ def needs_evaluations(self, matrix_store, model_id, subset_hash=''): # by querying the unique metrics and parameters relevant to the passed-in matrix session = self.sessionmaker() - evaluation_objects_in_db = session.query(eval_obj).filter_by( - model_id=model_id, - evaluation_start_time=matrix_store.as_of_dates[0], - evaluation_end_time=matrix_store.as_of_dates[-1], - as_of_date_frequency=matrix_store.metadata["as_of_date_frequency"], - subset_hash=subset_hash, - ).distinct(eval_obj.metric, eval_obj.parameter).all() + evaluation_objects_in_db = ( + session.query(eval_obj) + .filter_by( + model_id=model_id, + evaluation_start_time=matrix_store.as_of_dates[0], + evaluation_end_time=matrix_store.as_of_dates[-1], + as_of_date_frequency=matrix_store.metadata["as_of_date_frequency"], + subset_hash=subset_hash, + ) + .distinct(eval_obj.metric, eval_obj.parameter) + .all() + ) # The list of needed metrics and parameters are all the unique metric/params from the config # not present in the unique metric/params from the db evals_needed = bool( - {(met.metric, met.parameter_string) for met in metric_definitions} - - {(obj.metric, obj.parameter) for obj in evaluation_objects_in_db} + {(met.metric, met.parameter_string) for met in metric_definitions} + - {(obj.metric, obj.parameter) for obj in evaluation_objects_in_db} ) session.close() if evals_needed: - logger.notice(f"Needed evaluations for model {model_id} on matrix {matrix_store.uuid} are missing") + logger.notice( + f"Needed evaluations for model {model_id} on matrix {matrix_store.uuid} are missing" + ) return True # now check bias config if there @@ -453,8 +473,12 @@ def _compute_evaluations(self, predictions_proba, labels, metric_definitions): Returns: (list of MetricEvaluationResult objects) One result for each metric definition """ evals = [] - for (threshold_unit, threshold_value), metrics_for_threshold, in \ - itertools.groupby(metric_definitions, lambda m: (m.threshold_unit, m.threshold_value)): + for ( + (threshold_unit, threshold_value), + metrics_for_threshold, + ) in itertools.groupby( + metric_definitions, lambda m: (m.threshold_unit, m.threshold_value) + ): predicted_classes = 
generate_binary_at_x( predictions_proba, threshold_value, unit=threshold_unit ) @@ -463,7 +487,9 @@ def _compute_evaluations(self, predictions_proba, labels, metric_definitions): predicted_classes, labels ) num_labeled_examples = len(present_labels) - num_labeled_above_threshold = np.count_nonzero(predicted_classes_with_labels) + num_labeled_above_threshold = np.count_nonzero( + predicted_classes_with_labels + ) num_positive_labels = np.count_nonzero(present_labels) for metric_def in metrics_for_threshold: # using threshold configuration, convert probabilities to predicted classes @@ -500,7 +526,9 @@ def _compute_evaluations(self, predictions_proba, labels, metric_definitions): evals.append(result) return evals - def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, subset=None): + def evaluate( + self, predictions_proba, matrix_store, model_id, protected_df=None, subset=None + ): """Evaluate a model based on predictions, and save the results Args: @@ -515,26 +543,32 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, # If we are evaluating on a subset, we want to get just the labels and # predictions for the included entity-date pairs if subset: - logger.verbose(f"Subsetting labels and predictions of model {model_id} on matrix {matrix_store.uuid}") + logger.verbose( + f"Subsetting labels and predictions of model {model_id} on matrix {matrix_store.uuid}" + ) labels, predictions_proba, protected_df = subset_labels_and_predictions( - subset_df=query_subset_table( - self.db_engine, - matrix_store.as_of_dates, - get_subset_table_name(subset), - ), - predictions_proba=predictions_proba, - labels=matrix_store.labels, - protected_df=protected_df + subset_df=query_subset_table( + self.db_engine, + matrix_store.as_of_dates, + get_subset_table_name(subset), + ), + predictions_proba=predictions_proba, + labels=matrix_store.labels, + protected_df=protected_df, ) subset_hash = filename_friendly_hash(subset) else: - logger.debug(f"Using all the predictions of model {model_id} on matrix {matrix_store.uuid} for evaluation (i.e. no subset)") + logger.debug( + f"Using all the predictions of model {model_id} on matrix {matrix_store.uuid} for evaluation (i.e. no subset)" + ) labels = matrix_store.labels subset_hash = "" # confirm protected_df and labels have same set and count of values if (protected_df is not None) and (not protected_df.empty): - if (protected_df.index.shape != labels.index.shape) or (not protected_df.index.symmetric_difference(labels.index).empty): + if (protected_df.index.shape != labels.index.shape) or ( + not protected_df.index.symmetric_difference(labels.index).empty + ): raise ValueError("Mismatch between protected_df and labels indices") df_index = labels.index @@ -546,32 +580,46 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, logger.spam(f"Found {len(metric_defs)} metric definitions total") # 1. 
get worst sorting - predictions_proba_worst, labels_worst, df_index_worst = sort_predictions_and_labels( + ( + predictions_proba_worst, + labels_worst, + df_index_worst, + ) = sort_predictions_and_labels( predictions_proba=predictions_proba, labels=labels, df_index=df_index, - tiebreaker='worst', + tiebreaker="worst", ) worst_lookup = { (eval.metric, eval.parameter): eval - for eval in - self._compute_evaluations(predictions_proba_worst, labels_worst, metric_defs) + for eval in self._compute_evaluations( + predictions_proba_worst, labels_worst, metric_defs + ) } - logger.debug(f'Predictions from {model_id} sorted by worst case scenario, i.e. all negative and NULL labels first') + logger.debug( + f"Predictions from {model_id} sorted by worst case scenario, i.e. all negative and NULL labels first" + ) # 2. get best sorting - predictions_proba_best, labels_best, df_index_best = sort_predictions_and_labels( + ( + predictions_proba_best, + labels_best, + df_index_best, + ) = sort_predictions_and_labels( predictions_proba=predictions_proba_worst, labels=labels_worst, df_index=df_index_worst, - tiebreaker='best', + tiebreaker="best", ) best_lookup = { (eval.metric, eval.parameter): eval - for eval in - self._compute_evaluations(predictions_proba_best, labels_best, metric_defs) + for eval in self._compute_evaluations( + predictions_proba_best, labels_best, metric_defs + ) } - logger.debug(f'Predictions from {model_id} sorted by best case scenario, i.e. all positive labels first, NULL labels at the end') + logger.debug( + f"Predictions from {model_id} sorted by best case scenario, i.e. all positive labels first, NULL labels at the end" + ) evals_without_trials = dict() @@ -581,8 +629,16 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, for metric_def in metric_defs: worst_eval = worst_lookup[(metric_def.metric, metric_def.parameter_string)] best_eval = best_lookup[(metric_def.metric, metric_def.parameter_string)] - if worst_eval.value is None or best_eval.value is None or math.isclose(worst_eval.value, best_eval.value, rel_tol=RELATIVE_TOLERANCE): - evals_without_trials[(worst_eval.metric, worst_eval.parameter)] = worst_eval.value + if ( + worst_eval.value is None + or best_eval.value is None + or math.isclose( + worst_eval.value, best_eval.value, rel_tol=RELATIVE_TOLERANCE + ) + ): + evals_without_trials[ + (worst_eval.metric, worst_eval.parameter) + ] = worst_eval.value else: metric_defs_to_trial.append(metric_def) @@ -594,19 +650,23 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, random_eval_accumulator = defaultdict(list) for _ in range(0, SORT_TRIALS): sort_seed = generate_python_random_seed() - predictions_proba_random, labels_random, df_index_random = sort_predictions_and_labels( + ( + predictions_proba_random, + labels_random, + df_index_random, + ) = sort_predictions_and_labels( predictions_proba=predictions_proba_worst, labels=labels_worst, df_index=df_index_worst, - tiebreaker='random', - sort_seed=sort_seed + tiebreaker="random", + sort_seed=sort_seed, ) for random_eval in self._compute_evaluations( - predictions_proba_random, - labels_random, - metric_defs_to_trial + predictions_proba_random, labels_random, metric_defs_to_trial ): - random_eval_accumulator[(random_eval.metric, random_eval.parameter)].append(random_eval.value) + random_eval_accumulator[ + (random_eval.metric, random_eval.parameter) + ].append(random_eval.value) # 5. 
flatten best, worst, stochastic results for each metric definition # into database records @@ -622,16 +682,30 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, standard_deviation = 0 num_sort_trials = 0 else: - trial_results = [value for value in random_eval_accumulator[metric_key] if value is not None] + trial_results = [ + value + for value in random_eval_accumulator[metric_key] + if value is not None + ] stochastic_value = statistics.mean(trial_results) - standard_deviation = statistics.stdev(trial_results) + try: + standard_deviation = statistics.stdev(trial_results) + except ValueError: + logger.warning( + f"{metric_def.metric} not defined for parameter {metric_def.parameter_combination} because all values " + "are NaN. Inserting NULL for value." + ) + standard_deviation = None + num_sort_trials = len(trial_results) evaluation = matrix_type.evaluation_obj( metric=metric_def.metric, parameter=metric_def.parameter_string, num_labeled_examples=worst_lookup[metric_key].num_labeled_examples, - num_labeled_above_threshold=worst_lookup[metric_key].num_labeled_above_threshold, + num_labeled_above_threshold=worst_lookup[ + metric_key + ].num_labeled_above_threshold, num_positive_labels=worst_lookup[metric_key].num_positive_labels, worst_value=worst_lookup[metric_key].value, best_value=best_lookup[metric_key].value, @@ -657,23 +731,25 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, protected_df=protected_df.reindex(df_index_worst), predictions_proba=predictions_proba_worst, labels=labels_worst, - tie_breaker='worst', + tie_breaker="worst", subset_hash=subset_hash, matrix_type=matrix_type, evaluation_start_time=evaluation_start_time, evaluation_end_time=evaluation_end_time, - matrix_uuid=matrix_store.uuid) + matrix_uuid=matrix_store.uuid, + ) self._write_audit_to_db( model_id=model_id, protected_df=protected_df.reindex(df_index_best), predictions_proba=predictions_proba_best, labels=labels_best, - tie_breaker='best', + tie_breaker="best", subset_hash=subset_hash, matrix_type=matrix_type, evaluation_start_time=evaluation_start_time, evaluation_end_time=evaluation_end_time, - matrix_uuid=matrix_store.uuid) + matrix_uuid=matrix_store.uuid, + ) def _write_audit_to_db( self, @@ -686,7 +762,7 @@ def _write_audit_to_db( matrix_type, evaluation_start_time, evaluation_end_time, - matrix_uuid + matrix_uuid, ): """ Runs the bias audit and saves the result in the bias table. 
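The try/except added around `statistics.stdev` in the evaluation.py hunk above relies on `statistics.StatisticsError` being a subclass of `ValueError`: when every randomized sort trial produces a null value, the filtered `trial_results` list has fewer than two entries and `stdev` raises. A small standalone illustration, assuming nothing beyond the standard library:

```python
import statistics

# What happens inside the new guard when all sort trials returned null values:
trial_results = [value for value in [None, None, None] if value is not None]

try:
    standard_deviation = statistics.stdev(trial_results)
except ValueError as exc:
    # statistics.StatisticsError subclasses ValueError, so this clause catches
    # the "requires at least two data points" error and NULL is stored instead.
    standard_deviation = None
    print(f"stdev undefined: {exc}")
```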
@@ -716,58 +792,69 @@ def _write_audit_to_db( # score, label value, model_id, protected attributes # fill out the protected_df, which just has protected attributes at this point protected_df = protected_df.copy() - protected_df['model_id'] = model_id - protected_df['score'] = predictions_proba - protected_df['label_value'] = labels + protected_df["model_id"] = model_id + protected_df["score"] = predictions_proba + protected_df["label_value"] = labels aequitas_df, attr_cols_input = preprocess_input_df(protected_df) # create group crosstabs g = Group() score_thresholds = {} - score_thresholds['rank_abs'] = self.bias_config['thresholds'].get('top_n', []) + score_thresholds["rank_abs"] = self.bias_config["thresholds"].get("top_n", []) # convert 0-100 percentile to 0-1 that Aequitas expects - score_thresholds['rank_pct'] = [value / 100.0 for value in self.bias_config['thresholds'].get('percentiles', [])] - groups_model, attr_cols = g.get_crosstabs(aequitas_df, - score_thresholds=score_thresholds, - attr_cols=attr_cols_input) + score_thresholds["rank_pct"] = [ + value / 100.0 + for value in self.bias_config["thresholds"].get("percentiles", []) + ] + groups_model, attr_cols = g.get_crosstabs( + aequitas_df, score_thresholds=score_thresholds, attr_cols=attr_cols_input + ) # analyze bias from reference groups bias = Bias() - ref_groups_method = self.bias_config.get('ref_groups_method', None) - if ref_groups_method == 'predefined' and self.bias_config['ref_groups']: - bias_df = bias.get_disparity_predefined_groups(groups_model, aequitas_df, self.bias_config['ref_groups']) - elif ref_groups_method == 'majority': + ref_groups_method = self.bias_config.get("ref_groups_method", None) + if ref_groups_method == "predefined" and self.bias_config["ref_groups"]: + bias_df = bias.get_disparity_predefined_groups( + groups_model, aequitas_df, self.bias_config["ref_groups"] + ) + elif ref_groups_method == "majority": bias_df = bias.get_disparity_major_group(groups_model, aequitas_df) else: bias_df = bias.get_disparity_min_metric(groups_model, aequitas_df) # analyze fairness for each group - f = Fairness(tau=0.8) # the default fairness threshold is 0.8 + f = Fairness(tau=0.8) # the default fairness threshold is 0.8 group_value_df = f.get_group_value_fairness(bias_df) - group_value_df['subset_hash'] = subset_hash - group_value_df['tie_breaker'] = tie_breaker - group_value_df['evaluation_start_time'] = evaluation_start_time - group_value_df['evaluation_end_time'] = evaluation_end_time - group_value_df['matrix_uuid'] = matrix_uuid - group_value_df = group_value_df.rename(index=str, columns={"score_threshold": "parameter", "for": "for_"}) + group_value_df["subset_hash"] = subset_hash + group_value_df["tie_breaker"] = tie_breaker + group_value_df["evaluation_start_time"] = evaluation_start_time + group_value_df["evaluation_end_time"] = evaluation_end_time + group_value_df["matrix_uuid"] = matrix_uuid + group_value_df = group_value_df.rename( + index=str, columns={"score_threshold": "parameter", "for": "for_"} + ) if group_value_df.empty: - raise ValueError(f""" + raise ValueError( + f""" Bias audit: aequitas_audit() failed. 
Returned empty dataframe for model_id = {model_id}, and subset_hash = {subset_hash} - and matrix_type = {matrix_type}""") + and matrix_type = {matrix_type}""" + ) with scoped_session(self.db_engine) as session: for index, row in group_value_df.iterrows(): session.query(matrix_type.aequitas_obj).filter_by( - model_id=row['model_id'], - evaluation_start_time=row['evaluation_start_time'], - evaluation_end_time=row['evaluation_end_time'], - subset_hash=row['subset_hash'], - parameter=row['parameter'], - tie_breaker=row['tie_breaker'], - matrix_uuid=row['matrix_uuid'], - attribute_name=row['attribute_name'], - attribute_value=row['attribute_value'] + model_id=row["model_id"], + evaluation_start_time=row["evaluation_start_time"], + evaluation_end_time=row["evaluation_end_time"], + subset_hash=row["subset_hash"], + parameter=row["parameter"], + tie_breaker=row["tie_breaker"], + matrix_uuid=row["matrix_uuid"], + attribute_name=row["attribute_name"], + attribute_value=row["attribute_value"], ).delete() - session.bulk_insert_mappings(matrix_type.aequitas_obj, group_value_df.to_dict(orient="records")) + session.bulk_insert_mappings( + matrix_type.aequitas_obj, group_value_df.to_dict(orient="records") + ) @db_retry def _write_to_db( @@ -805,7 +892,7 @@ def _write_to_db( evaluation_start_time=evaluation_start_time, evaluation_end_time=evaluation_end_time, as_of_date_frequency=as_of_date_frequency, - subset_hash=subset_hash + subset_hash=subset_hash, ).delete() for evaluation in evaluations: diff --git a/src/triage/component/postmodeling/crosstabs.py b/src/triage/component/postmodeling/crosstabs.py index 6ab79bf85..5cc67d742 100644 --- a/src/triage/component/postmodeling/crosstabs.py +++ b/src/triage/component/postmodeling/crosstabs.py @@ -1,5 +1,7 @@ +import ohio.ext.pandas import pandas as pd import verboselogs, logging + logger = verboselogs.VerboseLogger(__name__) from scipy import stats @@ -42,9 +44,11 @@ def __init__(self, config_path=None, config=None): def hr_lr_ttest(hr, lr): - """ Returns the t-test (T statistic and p value), comparing the features for - high- and low-risk entities. """ - res = stats.ttest_ind(hr.to_numpy(), lr.to_numpy(), axis=0, nan_policy="omit", equal_var=False) + """Returns the t-test (T statistic and p value), comparing the features for + high- and low-risk entities.""" + res = stats.ttest_ind( + hr.to_numpy(), lr.to_numpy(), axis=0, nan_policy="omit", equal_var=False + ) r0 = pd.Series(res[0], index=hr.columns) r1 = pd.Series(res[1], index=hr.columns) @@ -84,43 +88,53 @@ def populate_crosstabs_table( schema="results", table="crosstabs", ): - """ This function populates the results.crosstabs table. - - Args: - entity_id_list (list): A list of entity_ids. Crosstabs will - be calculated using only predictions and - features of these entities. This list is - being passed to get_predictions_query and - get_features_query. - crosstab_functions (dict): A dictionary of function names and - functions. The functions receive the feature - dataframes for entities that were predicted - postive or negative, respectively. Each function - returns a Series or DataFrame with rows now - indexed by the feature names, and columns - corresponding to the statistics that the - function calculated. - thresholds (dict): A dictionary that maps column names to lists of - threshold values. The (name, threshold) pairs will - be applied to the dataframes with predictions to - split low- and high-risk entities. - push_to_db (bool): If True, the crosstab table is being populated. 
- return_df (bool): If True, a dataframe corresponding to the crosstabe - table will be returned. - engine (SQLAlchemy engine): DB connection - schema (str): Name of the schema to store results; defaults to 'results' - table (str): Name of table to store results; defaults to 'crosstabs' - - Returns: A DataFrame, corresponding to results.crosstabs + """This function populates the results.crosstabs table. + + Args: + entity_id_list (list): A list of entity_ids. Crosstabs will + be calculated using only predictions and + features of these entities. This list is + being passed to get_predictions_query and + get_features_query. + crosstab_functions (dict): A dictionary of function names and + functions. The functions receive the feature + dataframes for entities that were predicted + postive or negative, respectively. Each function + returns a Series or DataFrame with rows now + indexed by the feature names, and columns + corresponding to the statistics that the + function calculated. + thresholds (dict): A dictionary that maps column names to lists of + threshold values. The (name, threshold) pairs will + be applied to the dataframes with predictions to + split low- and high-risk entities. + push_to_db (bool): If True, the crosstab table is being populated. + return_df (bool): If True, a dataframe corresponding to the crosstabe + table will be returned. + engine (SQLAlchemy engine): DB connection + schema (str): Name of the schema to store results; defaults to 'results' + table (str): Name of table to store results; defaults to 'crosstabs' + + Returns: A DataFrame, corresponding to results.crosstabs """ - print("\n\n****************\nRUNNING populate_crosstabs_table for model_id ", model_id, "and as_of_date ", as_of_date) + logger.info( + f"RUNNING populate_crosstabs_table for model_id {model_id} and as_of_date {as_of_date}", + ) if len(df) == 0: raise ValueError("No data could be fetched.") dfs = [] # this will be useful later - the non-feature columns we'll grab - model_cols = ["model_id", "as_of_date", "entity_id", "score", "rank_abs", "rank_pct", "label_value"] + model_cols = [ + "model_id", + "as_of_date", + "entity_id", + "score", + "rank_abs", + "rank_pct", + "label_value", + ] # drop the all-null columns null_cols = df.columns[df.isnull().sum() == df.shape[0]] @@ -132,27 +146,31 @@ def populate_crosstabs_table( # if the dataframe isn't dummified, do it if object in df.dtypes.values or pd.Categorical.dtype in df.dtypes.values: - to_dummify = [c for c in df.select_dtypes(include=["category", object]) if c not in model_cols] - print("Dummifying the data") + to_dummify = [ + c + for c in df.select_dtypes(include=["category", object]) + if c not in model_cols + ] + logger.debug("Dummifying the data") df = pd.get_dummies(df, columns=to_dummify, dummy_na=True) feat_cols = [c for c in df.columns if c not in model_cols] - print("Iterating over thresholds to generate results...") + logger.debug("Iterating over thresholds to generate results...") for thres_type, thress in thresholds.items(): for thres in thress: results = pd.DataFrame(index=feat_cols) - print("split dataframe in high risk and lowriks") + logger.debug("split dataframe in high risk and lowriks") # split dataframe into high/low risk df_pred_pos = df.loc[df[thres_type] <= thres, feat_cols] df_pred_neg = df.loc[df[thres_type] > thres, feat_cols] - print("len of hr and lr", len(df_pred_pos), len(df_pred_neg)) + logger.debug(f"len of hr: {len(df_pred_pos)} and lr: {len(df_pred_neg)}") for name, func in crosstab_functions: - print(name) 
+ logger.debug(name) this_result = pd.DataFrame(func(df_pred_pos, df_pred_neg)) if name in ["ttest_T", "ttest_p"]: - print("this_result:", this_result.shape) - print("Results:", results.shape) + logger.debug("this_result:", this_result.shape) + logger.debug("Results:", results.shape) if not type(name) in [list, tuple]: name = [name] # the metric name is coming from the crosstab_functions tuples @@ -173,8 +191,10 @@ def populate_crosstabs_table( df = pd.concat(dfs) if push_to_db: - print("Pushing results to database...") - df.reset_index().set_index(["model_id", "as_of_date", "metric"]).to_sql(schema=schema, name=table, con=engine, if_exists="append") + logger.debug("Pushing results to database...") + df.reset_index().set_index(["model_id", "as_of_date", "metric"]).pg_copy_to( + schema=schema, name=table, con=engine, if_exists="append" + ) if return_df: return df @@ -202,7 +222,9 @@ def run_crosstabs(db_engine, crosstabs_config): predictions_query=crosstabs_config.predictions_query, ) if len(crosstabs_config.entity_id_list) > 0: - crosstabs_query += " where entity_id=ANY('{%s}') " % ", ".join(map(str, crosstabs_config.entity_id_list)) + crosstabs_query += " where entity_id=ANY('{%s}') " % ", ".join( + map(str, crosstabs_config.entity_id_list) + ) crosstabs_query += " order by model_id, as_of_date, rank_abs asc;" df = pd.read_sql(crosstabs_query, db_engine) if len(df) == 0: diff --git a/src/triage/experiments/base.py b/src/triage/experiments/base.py index 4bb7aa169..f48e05410 100644 --- a/src/triage/experiments/base.py +++ b/src/triage/experiments/base.py @@ -151,6 +151,7 @@ def __init__( self._check_config_version(config) self.config = config + if self.config.get("cohort_config") is not None: self.config["cohort_config"] = load_query_if_needed( self.config["cohort_config"] @@ -897,7 +898,7 @@ def train_and_test_models(self): experiment.models_needed = len(model_hashes) record_model_building_started(self.run_id, self.db_engine) self.process_train_test_batches(batches) - logger.success("Training, testing and evaluatiog models completed") + logger.success("Training, testing and evaluating models completed") def validate(self, strict=True): ExperimentValidator(self.db_engine, strict=strict).run(self.config)
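The crosstabs.py hunks above replace the `DataFrame.to_sql` push with ohio's COPY-based `pg_copy_to` accessor, which is why `import ohio.ext.pandas` is added at the top of that module. A minimal sketch of the two accessors as they are used in this patch; the connection URL, schema, table name, and toy dataframe are placeholders rather than anything from the Triage codebase, and a reachable PostgreSQL database is assumed.

```python
# Sketch of the ohio pandas accessors used in this patch. Importing
# ohio.ext.pandas registers pg_copy_to / pg_copy_from on DataFrame.
import ohio.ext.pandas  # noqa: F401
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("postgresql:///triage_results")  # hypothetical database

df = pd.DataFrame(
    {"model_id": [1], "as_of_date": ["2016-01-01"], "metric": ["ttest_T"]}
).set_index(["model_id", "as_of_date", "metric"])

# COPY-based write, mirroring the new crosstabs.py call
df.pg_copy_to(schema="results", name="crosstabs", con=engine, if_exists="append")

# COPY-based read, mirroring the query_subset_table call in evaluation.py
round_trip = pd.DataFrame.pg_copy_from(
    "select * from results.crosstabs", connectable=engine
)
print(round_trip.head())
```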