diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index ec1e5b948..bfd151b8f 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -8,10 +8,10 @@ jobs: runs-on: ubuntu-18.04 steps: - uses: actions/checkout@master - - name: Set up Python 3.7 + - name: Set up Python 3.8 uses: actions/setup-python@v1 with: - python-version: 3.7 + python-version: 3.8 - name: Install pypa/build run: >- python -m diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 78f9f7772..5d22fb7f8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -22,7 +22,7 @@ jobs: --health-retries 5 strategy: matrix: - python-version: [3.7, 3.8, 3.9] + python-version: ['3.8', '3.9', '3.10'] steps: - uses: actions/checkout@v2 diff --git a/.travis.yml b/.travis.yml index 0d1f0677a..d57db252d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,6 @@ dist: bionic language: python -python: ["3.6", "3.7"] +python: ["3.8", "3.9", "3.10"] addons: postgresql: '11' apt: diff --git a/Dockerfile b/Dockerfile index 4665f1919..aaacb20af 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.7-slim AS development +FROM python:3.8-slim AS development LABEL creator="Center for Data Science and Public Policy (DSaPP)" \ maintainer="Adolfo De Unánue " \ @@ -60,7 +60,7 @@ RUN pip install -e . ENTRYPOINT [ "bash" ] -FROM python:3.7-slim AS master +FROM python:3.8-slim AS master LABEL triage.version="master" diff --git a/README.md b/README.md index 8aac62da0..3d985389f 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Triage is designed to: To install Triage, you need: -- Python 3.7+ +- Python 3.8+ - A PostgreSQL 9.6+ database with your source data (events, geographical data, etc) loaded. - **NOTE**: If your database is PostgreSQL 11+ you will get some diff --git a/docs/sources/dirtyduck/dirty_duckling.md b/docs/sources/dirtyduck/dirty_duckling.md index 7ca9fa5bf..5d1fadaa9 100644 --- a/docs/sources/dirtyduck/dirty_duckling.md +++ b/docs/sources/dirtyduck/dirty_duckling.md @@ -242,7 +242,7 @@ triage experiment experiments/dirty-duckling.yaml ``` That's it! If you see this message in your screen: - 2020-08-20 16:56:56 - SUCCESS Training, testing and evaluatiog models completed + 2020-08-20 16:56:56 - SUCCESS Training, testing and evaluating models completed 2020-08-20 16:56:56 - SUCCESS All matrices that were supposed to be build were built. Awesome! 2020-08-20 16:56:56 - SUCCESS All models that were supposed to be trained were trained. Awesome! 2020-08-20 16:56:56 - SUCCESS Experiment (a336de4800cec8964569d051dc56f85d) ran through completion diff --git a/docs/sources/experiments/running.md b/docs/sources/experiments/running.md index e71e97de9..ecc76a584 100644 --- a/docs/sources/experiments/running.md +++ b/docs/sources/experiments/running.md @@ -4,7 +4,7 @@ To use a Triage experiment, you first need: -- Python 3.7+ +- Python 3.8+ - A PostgreSQL (v9.6+) database with your source data (events, geographical data, etc) loaded. 
- Ample space on an available disk (or S3) to store the needed matrices and models for your experiment - An experiment definition (see [Experiment configuration](experiment-config.md)) diff --git a/docs/sources/index.md b/docs/sources/index.md index 680a150a4..a2c1f04c4 100644 --- a/docs/sources/index.md +++ b/docs/sources/index.md @@ -25,7 +25,7 @@ Triage is designed to: To install Triage, you need: -- Python 3.7+ +- Python 3.8+ - A PostgreSQL 9.6+ database with your source data (events, geographical data, etc) loaded. - **NOTE**: If your database is PostgreSQL 11+ you will get some diff --git a/docs/sources/quickstart.md b/docs/sources/quickstart.md index 63cc27957..84a5157ff 100644 --- a/docs/sources/quickstart.md +++ b/docs/sources/quickstart.md @@ -137,7 +137,7 @@ defaults for others. The primary parameters to specify (for now) are: - `entity_id`: each `entity_id` affected by an event within the amount of time specified by `label_timespan` after a given `as_of_date` - `outcome`: a binary variable representing the events that happened to each entity, within the period specified by that `as_of_date` and `label_timespan` - The query is parameterized over `as_of_date`, and `label_timespan`. These parameters are passed to your query as named keywords using the Python's [`str.format()`](https://docs.python.org/3.7/library/stdtypes.html#str.format) method. You can use them in your query by surrounding their keywords with curly braces (as in the example below). + The query is parameterized over `as_of_date`, and `label_timespan`. These parameters are passed to your query as named keywords using the Python's [`str.format()`](https://docs.python.org/3.8/library/stdtypes.html#str.format) method. You can use them in your query by surrounding their keywords with curly braces (as in the example below). See our [guide to Labels](https://dssg.github.io/triage/experiments/cohort-labels/) for a more in-depth discussion of this topic. 
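For context on the quickstart change above: the label query is rendered with Python's `str.format()`, so `{as_of_date}` and `{label_timespan}` are plain curly-brace placeholders in the SQL text. A minimal sketch of that pattern follows; the `events` table and its columns are hypothetical placeholders, not taken from the Triage docs.

```python
# Minimal sketch of a str.format()-parameterized label query.
# The "events" table and its columns are hypothetical placeholders.
label_query = """
    select
        entity_id,
        max(outcome)::integer as outcome
    from events
    where event_date > '{as_of_date}'::timestamp
      and event_date <= '{as_of_date}'::timestamp + interval '{label_timespan}'
    group by entity_id
"""

# Triage fills the named keywords the same way, leaving the curly braces
# in the config file untouched until the query is rendered.
rendered = label_query.format(as_of_date="2016-01-01", label_timespan="6 months")
print(rendered)
```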
diff --git a/requirement/dev.txt b/requirement/dev.txt index b117eb353..45f5418a4 100644 --- a/requirement/dev.txt +++ b/requirement/dev.txt @@ -1,7 +1,7 @@ -r include/build.txt bumpversion==0.6.0 mkdocs==1.3.0 -pymdown-extensions==7.1 -mkdocs-material==7.0.6 -mkdocstrings==0.12.0 -black==19.10b0 +pymdown-extensions==9.4 +mkdocs-material==8.2.12 +mkdocstrings==0.18.1 +black==22.3.0 diff --git a/requirement/include/build.txt b/requirement/include/build.txt index b82d21b37..e75560c7c 100644 --- a/requirement/include/build.txt +++ b/requirement/include/build.txt @@ -1 +1 @@ -wheel==0.34.2 +wheel==0.37.1 diff --git a/requirement/include/lint.txt b/requirement/include/lint.txt index d0acd85e8..ec6112141 100644 --- a/requirement/include/lint.txt +++ b/requirement/include/lint.txt @@ -1 +1 @@ -flake8==3.8.3 +flake8==4.0.1 diff --git a/requirement/include/test-management.txt b/requirement/include/test-management.txt index 9ed8460c3..b4b6c778c 100644 --- a/requirement/include/test-management.txt +++ b/requirement/include/test-management.txt @@ -1,3 +1,3 @@ -codecov==2.1.7 +codecov==2.1.12 coverage>=4.4 -tox==3.16.1 +tox==3.25.0 diff --git a/requirement/main.txt b/requirement/main.txt index b074802e2..d4044d746 100644 --- a/requirement/main.txt +++ b/requirement/main.txt @@ -1,31 +1,31 @@ -alembic==1.4.2 -SQLAlchemy==1.3.18 -PyYAML==5.4.1 -psycopg2-binary==2.8.5 -python-dateutil==2.8.1 -boto3==1.14.45 -click==7.1.2 -inflection==0.5.0 -numpy==1.21.1 +alembic==1.7.7 +SQLAlchemy==1.3.18 # pyup: ignore +PyYAML==6.0 +psycopg2-binary==2.9.3 +python-dateutil==2.8.2 +boto3==1.22.4 +click==8.1.3 +inflection==0.5.1 +numpy==1.22.3 sqlalchemy-postgres-copy==0.5.0 retrying==1.3.3 Dickens==1.0.1 signalled-timeout==1.0.0 -s3fs==0.4.2 -wrapt==1.13.3 +wrapt==1.14.0 argcmdr==0.7.0 sqlparse==0.4.2 -pebble==4.5.3 +pebble==4.6.3 adjustText==0.7.3 -graphviz==0.14 -requests==2.24.0 -coloredlogs==14.0 +graphviz==0.20 +requests==2.27.1 +coloredlogs==15.0.1 verboselogs==1.7 -scipy==1.5.0 -scikit-learn==0.23.1 -matplotlib==3.3.4 -pandas==1.0.5 -seaborn==0.10.1 +s3fs==0.4.2 # pyup: ignore +scipy==1.8.0 +scikit-learn==1.0.2 +matplotlib==3.5.1 +pandas==1.3.5 # pyup: ignore +seaborn==0.11.2 ohio==0.5.0 diff --git a/requirement/test.txt b/requirement/test.txt index 4e91b1e2f..0ffe10b34 100644 --- a/requirement/test.txt +++ b/requirement/test.txt @@ -1,11 +1,11 @@ -r include/lint.txt -r include/test-management.txt parsedatetime==2.6 -csvkit==1.0.5 -factory_boy==2.12.0 +csvkit==1.0.7 +factory_boy==3.2.1 testing.postgresql==1.3.0 -pytest==5.4.3 #<4.0.0 # pyup: ignore -pytest-cov==2.10.0 -moto==1.3.14 -fakeredis==1.4.1 -hypothesis==5.19.0 +pytest==6.2.5 #<4.0.0 # pyup: ignore +pytest-cov==3.0.0 +moto==3.1.7 +fakeredis==1.7.1 +hypothesis==6.46.1 diff --git a/setup.py b/setup.py index 9e768aa86..7a649114f 100755 --- a/setup.py +++ b/setup.py @@ -7,15 +7,15 @@ ROOT_PATH = Path(__file__).parent -LICENSE_PATH = ROOT_PATH / 'LICENSE' +LICENSE_PATH = ROOT_PATH / "LICENSE" -README_PATH = ROOT_PATH / 'README.md' +README_PATH = ROOT_PATH / "README.md" -REQUIREMENTS_PATH = ROOT_PATH / 'requirement' / 'main.txt' +REQUIREMENTS_PATH = ROOT_PATH / "requirement" / "main.txt" -REQUIREMENTS_TEST_PATH = ROOT_PATH / 'requirement' / 'test.txt' +REQUIREMENTS_TEST_PATH = ROOT_PATH / "requirement" / "test.txt" -REQUIREMENTS_RQ_PATH = ROOT_PATH / 'requirement' / 'extras-rq.txt' +REQUIREMENTS_RQ_PATH = ROOT_PATH / "requirement" / "extras-rq.txt" def stream_requirements(fd): @@ -25,8 +25,8 @@ def stream_requirements(fd): """ for line in fd: - cleaned = 
re.sub(r'#.*$', '', line).strip() - if cleaned and not cleaned.startswith('-r'): + cleaned = re.sub(r"#.*$", "", line).strip() + if cleaned and not cleaned.startswith("-r"): yield cleaned @@ -43,40 +43,41 @@ def stream_requirements(fd): setup( - name='triage', - version='5.1.1', + name="triage", + version="5.1.1", description="Risk modeling and prediction", long_description=README_PATH.read_text(), long_description_content_type="text/markdown", author="Center for Data Science and Public Policy", - author_email='datascifellows@gmail.com', + author_email="datascifellows@gmail.com", url="https://dssg.github.io/triage/", project_urls={ "Documentation": "https://dssg.github.io/triage/", "Source Code": "https://github.com/dssg/triage", - "Tutorial": "https://dssg.github.io/triage/dirtyduck/" + "Tutorial": "https://dssg.github.io/triage/dirtyduck/", }, - packages=find_packages('src', exclude=['tests', 'tests.*']), - package_dir={'': 'src'}, + packages=find_packages("src", exclude=["tests", "tests.*"]), + package_dir={"": "src"}, include_package_data=True, install_requires=REQUIREMENTS, entry_points={ - 'console_scripts': ['triage = triage.cli:execute'], + "console_scripts": ["triage = triage.cli:execute"], }, - extras_require={'rq': RQ_REQUIREMENTS}, + extras_require={"rq": RQ_REQUIREMENTS}, license="MIT License", zip_safe=False, - keywords='triage', + keywords="triage", classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", ], - python_requires='>=3.6', - test_suite='tests', - tests_require=REQUIREMENTS_TEST + python_requires=">=3.8", + test_suite="tests", + tests_require=REQUIREMENTS_TEST, ) diff --git a/src/tests/architect_tests/test_builders.py b/src/tests/architect_tests/test_builders.py index 4818f3a69..005f27c54 100644 --- a/src/tests/architect_tests/test_builders.py +++ b/src/tests/architect_tests/test_builders.py @@ -3,6 +3,8 @@ import pandas as pd import testing.postgresql +from unittest.mock import Mock + from triage import create_engine from contextlib import contextmanager @@ -421,11 +423,11 @@ def test_load_features_data(): cols = ["entity_id", "as_of_date"] + features[i] temp_df = pd.DataFrame(table, columns=cols) temp_df["as_of_date"] = convert_string_column_to_date(temp_df["as_of_date"]) - features_dfs.append( - ids_dates.merge( - right=temp_df, how="left", on=["entity_id", "as_of_date"] - ).set_index(["entity_id", "as_of_date"]) + merged_df = ids_dates.merge( + right=temp_df, how="left", on=["entity_id", "as_of_date"] ) + merged_df["as_of_date"] = pd.to_datetime(merged_df["as_of_date"]) + features_dfs.append(merged_df.set_index(["entity_id", "as_of_date"])) # create an engine and generate a table with fake feature data with testing.postgresql.Postgresql() as postgresql: diff --git a/src/tests/catwalk_tests/test_evaluation.py b/src/tests/catwalk_tests/test_evaluation.py index ea2ee8a08..3f193560d 100644 --- a/src/tests/catwalk_tests/test_evaluation.py +++ b/src/tests/catwalk_tests/test_evaluation.py @@ -91,8 +91,7 
@@ def populate_subset_data(db_engine, subset, entity_ids, as_of_date=TRAIN_END_TIM from unfiltered_row {query_where_clause} """ - db_engine.execute( - text(insert_query).execution_options(autocommit=True)) + db_engine.execute(text(insert_query).execution_options(autocommit=True)) def test_all_same_labels(db_engine_with_results_schema): @@ -128,13 +127,14 @@ def test_all_same_labels(db_engine_with_results_schema): "entity_id": list(range(num_entities)), "as_of_date": [TRAIN_END_TIME] * num_entities, } - ).set_index(["entity_id", "as_of_date"]).label_value, + ) + .set_index(["entity_id", "as_of_date"]) + .label_value, init_as_of_dates=[TRAIN_END_TIME], ) model_evaluator.evaluate( - trained_model.predict_proba( - labels)[:, 1], fake_matrix_store, model_id + trained_model.predict_proba(labels)[:, 1], fake_matrix_store, model_id ) for metric, best, worst, stochastic in db_engine_with_results_schema.execute( @@ -143,10 +143,7 @@ def test_all_same_labels(db_engine_with_results_schema): where model_id = %s and evaluation_start_time = %s order by 1""", - ( - model_id, - fake_matrix_store.as_of_dates[0] - ), + (model_id, fake_matrix_store.as_of_dates[0]), ): if metric == "accuracy": assert best is not None @@ -174,7 +171,9 @@ def test_subset_labels_and_predictions(db_engine_with_results_schema): "entity_id": list(range(num_entities)), "as_of_date": [TRAIN_END_TIME] * num_entities, } - ).set_index(["entity_id", "as_of_date"]).label_value, + ) + .set_index(["entity_id", "as_of_date"]) + .label_value, init_as_of_dates=[TRAIN_END_TIME], ) @@ -186,9 +185,14 @@ def test_subset_labels_and_predictions(db_engine_with_results_schema): elif subset["name"] == "empty": expected_result = 0 - populate_subset_data(db_engine_with_results_schema, - subset, list(range(num_entities))) - subset_labels, subset_predictions, subset_protected_df = subset_labels_and_predictions( + populate_subset_data( + db_engine_with_results_schema, subset, list(range(num_entities)) + ) + ( + subset_labels, + subset_predictions, + subset_protected_df, + ) = subset_labels_and_predictions( subset_df=query_subset_table( db_engine_with_results_schema, fake_matrix_store.as_of_dates, @@ -229,8 +233,7 @@ def test_evaluating_early_warning(db_engine_with_results_schema): "average precision score", ] }, - {"metrics": ["fbeta@"], - "parameters": [{"beta": 0.75}, {"beta": 1.25}]}, + {"metrics": ["fbeta@"], "parameters": [{"beta": 0.75}, {"beta": 1.25}]}, ] training_metric_groups = [{"metrics": ["accuracy", "roc_auc"]}] @@ -256,7 +259,9 @@ def test_evaluating_early_warning(db_engine_with_results_schema): "entity_id": list(range(num_entities)), "as_of_date": [TRAIN_END_TIME] * num_entities, } - ).set_index(["entity_id", "as_of_date"]).label_value, + ) + .set_index(["entity_id", "as_of_date"]) + .label_value, init_as_of_dates=[TRAIN_END_TIME], ) fake_train_matrix_store = MockMatrixStore( @@ -270,7 +275,9 @@ def test_evaluating_early_warning(db_engine_with_results_schema): "entity_id": list(range(num_entities)), "as_of_date": [TRAIN_END_TIME] * num_entities, } - ).set_index(["entity_id", "as_of_date"]).label_value, + ) + .set_index(["entity_id", "as_of_date"]) + .label_value, init_as_of_dates=[TRAIN_END_TIME], ) @@ -282,14 +289,15 @@ def test_evaluating_early_warning(db_engine_with_results_schema): # ensure that the matrix uuid is present matrix_uuids = [ row[0] - for row in db_engine_with_results_schema.execute("select matrix_uuid from test_results.evaluations") + for row in db_engine_with_results_schema.execute( + "select matrix_uuid from 
test_results.evaluations" + ) ] assert all(matrix_uuid == "efgh" for matrix_uuid in matrix_uuids) # Evaluate the training metrics and test model_evaluator.evaluate( - trained_model.predict_proba( - labels)[:, 1], fake_train_matrix_store, model_id + trained_model.predict_proba(labels)[:, 1], fake_train_matrix_store, model_id ) records = [ row[0] @@ -309,19 +317,18 @@ def test_evaluating_early_warning(db_engine_with_results_schema): if subset is None: where_hash = "" else: - populate_subset_data(db_engine_with_results_schema, - subset, list(range(num_entities))) + populate_subset_data( + db_engine_with_results_schema, subset, list(range(num_entities)) + ) SubsetFactory(subset_hash=filename_friendly_hash(subset)) session.commit() where_hash = f"and subset_hash = '{filename_friendly_hash(subset)}'" - # Evaluate the testing metrics and test for all of them. model_evaluator.evaluate( trained_model.predict_proba(labels)[:, 1], fake_test_matrix_store, model_id, subset=subset, - ) records = [ @@ -335,10 +342,7 @@ def test_evaluating_early_warning(db_engine_with_results_schema): {where_hash} order by 1 """, - ( - model_id, - fake_test_matrix_store.as_of_dates[0] - ), + (model_id, fake_test_matrix_store.as_of_dates[0]), ) ] assert records == [ @@ -392,10 +396,7 @@ def test_evaluating_early_warning(db_engine_with_results_schema): evaluation_start_time = %s {where_hash} order by 1""", - ( - model_id, - fake_train_matrix_store.as_of_dates[0] - ), + (model_id, fake_train_matrix_store.as_of_dates[0]), ) ] assert records == ["accuracy", "roc_auc"] @@ -403,7 +404,9 @@ def test_evaluating_early_warning(db_engine_with_results_schema): # ensure that the matrix uuid is present matrix_uuids = [ row[0] - for row in db_engine_with_results_schema.execute("select matrix_uuid from train_results.evaluations") + for row in db_engine_with_results_schema.execute( + "select matrix_uuid from train_results.evaluations" + ) ] assert all(matrix_uuid == "1234" for matrix_uuid in matrix_uuids) @@ -432,12 +435,8 @@ def test_model_scoring_inspections(db_engine_with_results_schema): testing_labels = np.array([1, 0, np.nan, 1, 0]) testing_prediction_probas = np.array([0.56, 0.4, 0.55, 0.5, 0.3]) - training_labels = np.array( - [0, 0, 1, 1, 1, 0, 1, 1] - ) - training_prediction_probas = np.array( - [0.6, 0.4, 0.55, 0.70, 0.3, 0.2, 0.8, 0.6] - ) + training_labels = np.array([0, 0, 1, 1, 1, 0, 1, 1]) + training_prediction_probas = np.array([0.6, 0.4, 0.55, 0.70, 0.3, 0.2, 0.8, 0.6]) fake_train_matrix_store = MockMatrixStore( "train", "efgh", 5, db_engine_with_results_schema, training_labels @@ -556,14 +555,22 @@ def test_ModelEvaluator_needs_evaluation_no_bias_audit(db_engine_with_results_sc # make a test matrix to pass in metadata_overrides = { - 'as_of_date_frequency': as_of_date_frequency, - 'as_of_times': [eval_time], + "as_of_date_frequency": as_of_date_frequency, + "as_of_times": [eval_time], } test_matrix_store = MockMatrixStore( - "test", "1234", 5, db_engine_with_results_schema, metadata_overrides=metadata_overrides + "test", + "1234", + 5, + db_engine_with_results_schema, + metadata_overrides=metadata_overrides, ) train_matrix_store = MockMatrixStore( - "train", "2345", 5, db_engine_with_results_schema, metadata_overrides=metadata_overrides + "train", + "2345", + 5, + db_engine_with_results_schema, + metadata_overrides=metadata_overrides, ) # the evaluated model has test evaluations for precision, but not recall, @@ -575,10 +582,12 @@ def test_ModelEvaluator_needs_evaluation_no_bias_audit(db_engine_with_results_sc 
subset_hash = filename_friendly_hash(subset) assert ModelEvaluator( - testing_metric_groups=[{ - "metrics": ["precision@", "recall@"], - "thresholds": {"top_n": [100]}, - }], + testing_metric_groups=[ + { + "metrics": ["precision@", "recall@"], + "thresholds": {"top_n": [100]}, + } + ], training_metric_groups=[], db_engine=db_engine_with_results_schema, ).needs_evaluations( @@ -596,10 +605,12 @@ def test_ModelEvaluator_needs_evaluation_no_bias_audit(db_engine_with_results_sc subset_hash = filename_friendly_hash(subset) assert not ModelEvaluator( - testing_metric_groups=[{ - "metrics": ["precision@"], - "thresholds": {"top_n": [100]}, - }], + testing_metric_groups=[ + { + "metrics": ["precision@"], + "thresholds": {"top_n": [100]}, + } + ], training_metric_groups=[], db_engine=db_engine_with_results_schema, ).needs_evaluations( @@ -617,10 +628,12 @@ def test_ModelEvaluator_needs_evaluation_no_bias_audit(db_engine_with_results_sc subset_hash = filename_friendly_hash(subset) assert ModelEvaluator( - testing_metric_groups=[{ - "metrics": ["precision@"], - "thresholds": {"top_n": [100]}, - }], + testing_metric_groups=[ + { + "metrics": ["precision@"], + "thresholds": {"top_n": [100]}, + } + ], training_metric_groups=[], db_engine=db_engine_with_results_schema, ).needs_evaluations( @@ -638,14 +651,18 @@ def test_ModelEvaluator_needs_evaluation_no_bias_audit(db_engine_with_results_sc subset_hash = filename_friendly_hash(subset) assert ModelEvaluator( - testing_metric_groups=[{ - "metrics": ["precision@"], - "thresholds": {"top_n": [100]}, - }], - training_metric_groups=[{ - "metrics": ["precision@"], - "thresholds": {"top_n": [100]}, - }], + testing_metric_groups=[ + { + "metrics": ["precision@"], + "thresholds": {"top_n": [100]}, + } + ], + training_metric_groups=[ + { + "metrics": ["precision@"], + "thresholds": {"top_n": [100]}, + } + ], db_engine=db_engine_with_results_schema, ).needs_evaluations( matrix_store=train_matrix_store, @@ -668,9 +685,7 @@ def test_ModelEvaluator_needs_evaluation_with_bias_audit(db_engine_with_results_ }, ], training_metric_groups=[], - bias_config={ - 'thresholds': {'top_n': [2]} - }, + bias_config={"thresholds": {"top_n": [2]}}, db_engine=db_engine_with_results_schema, ) model_with_evaluations = ModelFactory() @@ -691,11 +706,15 @@ def test_ModelEvaluator_needs_evaluation_with_bias_audit(db_engine_with_results_ # make a test matrix to pass in metadata_overrides = { - 'as_of_date_frequency': as_of_date_frequency, - 'as_of_times': [eval_time], + "as_of_date_frequency": as_of_date_frequency, + "as_of_times": [eval_time], } test_matrix_store = MockMatrixStore( - "test", "1234", 5, db_engine_with_results_schema, metadata_overrides=metadata_overrides + "test", + "1234", + 5, + db_engine_with_results_schema, + metadata_overrides=metadata_overrides, ) assert model_evaluator.needs_evaluations( matrix_store=test_matrix_store, @@ -715,9 +734,7 @@ def test_evaluation_with_protected_df(db_engine_with_results_schema): }, ], training_metric_groups=[], - bias_config={ - 'thresholds': {'top_n': [2]} - }, + bias_config={"thresholds": {"top_n": [2]}}, db_engine=db_engine_with_results_schema, ) testing_labels = np.array([1, 0]) @@ -732,10 +749,12 @@ def test_evaluation_with_protected_df(db_engine_with_results_schema): train_end_time=TRAIN_END_TIME, ) - protected_df = pd.DataFrame({ - "entity_id": fake_test_matrix_store.design_matrix.index.levels[0].tolist(), - "protectedattribute1": "value1" - }) + protected_df = pd.DataFrame( + { + "entity_id": 
fake_test_matrix_store.design_matrix.index.levels[0].tolist(), + "protectedattribute1": "value1", + } + ) model_evaluator.evaluate( testing_prediction_probas, fake_test_matrix_store, model_id, protected_df @@ -746,10 +765,10 @@ def test_evaluation_with_protected_df(db_engine_with_results_schema): order by 1""", (model_id, fake_test_matrix_store.as_of_dates[0]), ): - assert record['model_id'] == model_id - assert record['parameter'] == '2_abs' - assert record['attribute_name'] == 'protectedattribute1' - assert record['attribute_value'] == 'value1' + assert record["model_id"] == model_id + assert record["parameter"] == "2_abs" + assert record["attribute_name"] == "protectedattribute1" + assert record["attribute_value"] == "value1" def test_evaluation_sorting_with_protected_df(db_engine_with_results_schema): @@ -763,34 +782,37 @@ def test_evaluation_sorting_with_protected_df(db_engine_with_results_schema): }, ], training_metric_groups=[], - bias_config={ - 'thresholds': {'top_n': [2]} - }, + bias_config={"thresholds": {"top_n": [2]}}, db_engine=db_engine_with_results_schema, ) testing_labels = np.array([1, 1, 1, 0, 1]) testing_prediction_probas = np.array([0.56, 0.55, 0.92, 0.85, 0.24]) fake_test_matrix_store = MockMatrixStore( - "test", "1234", 5, db_engine_with_results_schema, - metadata_overrides={'as_of_times': [TRAIN_END_TIME]}, + "test", + "1234", + 5, + db_engine_with_results_schema, + metadata_overrides={"as_of_times": [TRAIN_END_TIME]}, matrix=pd.DataFrame.from_dict( - { - "entity_id": [1, 2, 3, 4, 5], - "as_of_date": [pd.Timestamp(2016, 1, 1)]*5, - "feature_one": [3, 4, 3, 4, 3], - "feature_two": [5, 6, 5, 6, 5], - "label": testing_labels, - } - ).set_index(MatrixStore.indices), + { + "entity_id": [1, 2, 3, 4, 5], + "as_of_date": [pd.Timestamp(2016, 1, 1)] * 5, + "feature_one": [3, 4, 3, 4, 3], + "feature_two": [5, 6, 5, 6, 5], + "label": testing_labels, + } + ).set_index(MatrixStore.indices), init_labels=pd.DataFrame( { "label_value": testing_labels, "entity_id": [1, 2, 3, 4, 5], - "as_of_date": [pd.Timestamp(2016, 1, 1)]*5, + "as_of_date": [pd.Timestamp(2016, 1, 1)] * 5, } - ).set_index(["entity_id", "as_of_date"]).label_value, - init_as_of_dates=[TRAIN_END_TIME] + ) + .set_index(["entity_id", "as_of_date"]) + .label_value, + init_as_of_dates=[TRAIN_END_TIME], ) trained_model, model_id = fake_trained_model( @@ -798,16 +820,19 @@ def test_evaluation_sorting_with_protected_df(db_engine_with_results_schema): train_end_time=TRAIN_END_TIME, ) - protected_df = pd.DataFrame({ - # "entity_id": fake_test_matrix_store.design_matrix.index.levels[0].tolist(), - # "as_of_date": fake_test_matrix_store.design_matrix.index.levels[1].tolist(), - "protectedattribute1": ["low", "low", "low", "high", "high"] - }, index=fake_test_matrix_store.design_matrix.index) + protected_df = pd.DataFrame( + { + # "entity_id": fake_test_matrix_store.design_matrix.index.levels[0].tolist(), + # "as_of_date": fake_test_matrix_store.design_matrix.index.levels[1].tolist(), + "protectedattribute1": ["low", "low", "low", "high", "high"] + }, + index=fake_test_matrix_store.design_matrix.index, + ) # should be low has 3 records, all 1's; high has 2 records, one 1 expected = { "low": {"group_size": 3, "group_label_neg": 0, "group_label_pos": 3}, - "high": {"group_size": 2, "group_label_neg": 1, "group_label_pos": 1} + "high": {"group_size": 2, "group_label_neg": 1, "group_label_pos": 1}, } model_evaluator.evaluate( @@ -820,30 +845,24 @@ def test_evaluation_sorting_with_protected_df(db_engine_with_results_schema): order by 
1""", (model_id, fake_test_matrix_store.as_of_dates[0]), ): - assert record['model_id'] == model_id - assert record['parameter'] == '2_abs' - assert record['attribute_name'] == 'protectedattribute1' - for col, value in expected[record['attribute_value']].items(): + assert record["model_id"] == model_id + assert record["parameter"] == "2_abs" + assert record["attribute_name"] == "protectedattribute1" + for col, value in expected[record["attribute_value"]].items(): assert record[col] == value - def test_generate_binary_at_x(): - input_array = np.array( - [0.9, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.6]) + input_array = np.array([0.9, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.6]) # bug can arise when the same value spans both sides of threshold assert_array_equal( generate_binary_at_x(input_array, 50, "percentile"), - np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0]) + np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0]), ) assert_array_equal( - generate_binary_at_x(input_array, 2), - np.array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0]) + generate_binary_at_x(input_array, 2), np.array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0]) ) - assert_array_equal( - generate_binary_at_x(np.array([]), 2), - np.array([]) - ) + assert_array_equal(generate_binary_at_x(np.array([]), 2), np.array([])) diff --git a/src/tests/catwalk_tests/test_storage.py b/src/tests/catwalk_tests/test_storage.py index 53c0a589d..6dbb019fb 100644 --- a/src/tests/catwalk_tests/test_storage.py +++ b/src/tests/catwalk_tests/test_storage.py @@ -25,31 +25,38 @@ class SomeClass: - def __init__(self, val): self.val = val +@mock_s3 def test_S3Store(): - with mock_s3(): - client = boto3.client("s3") - client.create_bucket(Bucket="test_bucket", ACL="public-read-write") - store = S3Store(f"s3://test_bucket/a_path") - assert not store.exists() - store.write("val".encode("utf-8")) - assert store.exists() - newVal = store.load() - assert newVal.decode("utf-8") == "val" - store.delete() - assert not store.exists() + client = boto3.client("s3") + client.create_bucket( + Bucket="test_bucket", + ACL="public-read-write", + CreateBucketConfiguration={"LocationConstraint": "us-east-2"}, + ) + store = S3Store(f"s3://test_bucket/a_path") + assert not store.exists() + store.write("val".encode("utf-8")) + assert store.exists() + newVal = store.load() + assert newVal.decode("utf-8") == "val" + store.delete() + assert not store.exists() @mock_s3 def test_S3Store_large(): - client = boto3.client('s3') - client.create_bucket(Bucket='test_bucket', ACL='public-read-write') + client = boto3.client("s3") + client.create_bucket( + Bucket="test_bucket", + ACL="public-read-write", + CreateBucketConfiguration={"LocationConstraint": "us-east-2"}, + ) - store = S3Store('s3://test_bucket/a_path') + store = S3Store("s3://test_bucket/a_path") assert not store.exists() # NOTE: The issue under test (currently) arises when too large a "part" @@ -77,24 +84,24 @@ def test_S3Store_large(): # NOTE: to the point of self-invalidation. (But, this should do the # NOTE: trick; and, we can always increase the payload size here, or # NOTE: otherwise tweak configuration, as necessary.) 
- one_mb = 2 ** 20 + one_mb = 2**20 payload = b"0" * (10 * one_mb) # 10MiB text of all zeros - with CallSpy('botocore.client.BaseClient._make_api_call') as spy: + with CallSpy("botocore.client.BaseClient._make_api_call") as spy: store.write(payload) call_args = [call[0] for call in spy.calls] call_methods = [args[1] for args in call_args] assert call_methods == [ - 'CreateMultipartUpload', - 'UploadPart', - 'UploadPart', - 'CompleteMultipartUpload', + "CreateMultipartUpload", + "UploadPart", + "UploadPart", + "CompleteMultipartUpload", ] upload_args = call_args[1] - upload_body = upload_args[2]['Body'] + upload_body = upload_args[2]["Body"] # NOTE: Why is this a BufferIO rather than the underlying buffer?! # NOTE: (Would have expected the result of BufferIO.read() -- str.) @@ -123,22 +130,22 @@ def test_FSStore(): def test_ModelStorageEngine_nocaching(project_storage): mse = ModelStorageEngine(project_storage) - mse.write('testobject', 'myhash') - assert mse.exists('myhash') - assert mse.load('myhash') == 'testobject' - assert 'myhash' not in mse.cache + mse.write("testobject", "myhash") + assert mse.exists("myhash") + assert mse.load("myhash") == "testobject" + assert "myhash" not in mse.cache def test_ModelStorageEngine_caching(project_storage): mse = ModelStorageEngine(project_storage) with mse.cache_models(): - mse.write('testobject', 'myhash') + mse.write("testobject", "myhash") with mock.patch.object(mse, "_get_store") as get_store_mock: - assert mse.load('myhash') == 'testobject' + assert mse.load("myhash") == "testobject" assert not get_store_mock.called - assert 'myhash' in mse.cache + assert "myhash" in mse.cache # when cache_models goes out of scope the cache should be empty - assert 'myhash' not in mse.cache + assert "myhash" not in mse.cache DATA_DICT = OrderedDict( @@ -240,7 +247,7 @@ def test_MatrixStore_save(): "as_of_date": [pd.Timestamp(2017, 1, 1), pd.Timestamp(2017, 1, 1)], "feature_one": [0.5, 0.6], "feature_two": [0.5, 0.6], - "label": [1, 0] + "label": [1, 0], } df = pd.DataFrame.from_dict(data) labels = df.pop("label") @@ -250,10 +257,7 @@ def test_MatrixStore_save(): matrix_store.matrix_label_tuple = df, labels matrix_store.save() - assert_frame_equal( - matrix_store.design_matrix, - df - ) + assert_frame_equal(matrix_store.design_matrix, df) def test_MatrixStore_caching(): @@ -276,7 +280,7 @@ def test_as_of_dates(project_storage): pd.Timestamp(2017, 1, 1), pd.Timestamp(2017, 1, 1), ], - "label": [1, 0, 1, 0] + "label": [1, 0, 1, 0], } df = pd.DataFrame.from_dict(data) matrix_store = CSVMatrixStore( @@ -284,25 +288,32 @@ def test_as_of_dates(project_storage): [], "test", matrix=df, - metadata={"indices": ["entity_id", "as_of_date"], "label_name": "label"} + metadata={"indices": ["entity_id", "as_of_date"], "label_name": "label"}, ) - assert matrix_store.as_of_dates == [datetime.date(2016, 1, 1), datetime.date(2017, 1, 1)] + assert matrix_store.as_of_dates == [ + datetime.date(2016, 1, 1), + datetime.date(2017, 1, 1), + ] +@mock_s3 def test_s3_save(): - with mock_s3(): - client = boto3.client("s3") - client.create_bucket(Bucket="fake-matrix-bucket", ACL="public-read-write") - for example in matrix_stores(): - if not isinstance(example, CSVMatrixStore): - continue - project_storage = ProjectStorage("s3://fake-matrix-bucket") - - tosave = CSVMatrixStore(project_storage, [], "test") - tosave.metadata = example.metadata - tosave.matrix_label_tuple = example.matrix_label_tuple - tosave.save() - - tocheck = CSVMatrixStore(project_storage, [], "test") - assert 
tocheck.metadata == example.metadata - assert tocheck.design_matrix.to_dict() == example.design_matrix.to_dict() + client = boto3.client("s3") + client.create_bucket( + Bucket="fake-matrix-bucket", + ACL="public-read-write", + CreateBucketConfiguration={"LocationConstraint": "us-east-2"}, + ) + for example in matrix_stores(): + if not isinstance(example, CSVMatrixStore): + continue + project_storage = ProjectStorage("s3://fake-matrix-bucket") + + tosave = CSVMatrixStore(project_storage, [], "test") + tosave.metadata = example.metadata + tosave.matrix_label_tuple = example.matrix_label_tuple + tosave.save() + + tocheck = CSVMatrixStore(project_storage, [], "test") + assert tocheck.metadata == example.metadata + assert tocheck.design_matrix.to_dict() == example.design_matrix.to_dict() diff --git a/src/triage/component/catwalk/evaluation.py b/src/triage/component/catwalk/evaluation.py index b445393a1..f6f3540bb 100644 --- a/src/triage/component/catwalk/evaluation.py +++ b/src/triage/component/catwalk/evaluation.py @@ -1,6 +1,7 @@ import functools import itertools import verboselogs, logging + logger = verboselogs.VerboseLogger(__name__) import math @@ -22,7 +23,7 @@ db_retry, sort_predictions_and_labels, get_subset_table_name, - filename_friendly_hash + filename_friendly_hash, ) from triage.util.db import scoped_session from triage.util.random import generate_python_random_seed @@ -32,7 +33,6 @@ SORT_TRIALS = 30 - def subset_labels_and_predictions( subset_df, labels, @@ -61,7 +61,11 @@ def subset_labels_and_predictions( # The subset isn't specific to the cohort, so inner join to the labels/predictions labels_subset = labels.align(subset_df, join="inner")[0] predictions_subset = indexed_predictions.align(subset_df, join="inner")[0].values - protected_df_subset = protected_df if protected_df.empty else protected_df.align(subset_df, join="inner")[0] + protected_df_subset = ( + protected_df + if protected_df.empty + else protected_df.align(subset_df, join="inner")[0] + ) logger.spam( f"{len(labels_subset)} entities in subset out of {len(labels)} in matrix.", ) @@ -82,7 +86,9 @@ def query_subset_table(db_engine, as_of_dates, subset_table_name): active in the subset """ as_of_dates_sql = "[{}]".format( - ", ".join("'{}'".format(date.strftime("%Y-%m-%d %H:%M:%S.%f")) for date in as_of_dates) + ", ".join( + "'{}'".format(date.strftime("%Y-%m-%d %H:%M:%S.%f")) for date in as_of_dates + ) ) query_string = f""" with dates as ( @@ -92,8 +98,12 @@ def query_subset_table(db_engine, as_of_dates, subset_table_name): from {subset_table_name} join dates using(as_of_date) """ - df = pd.DataFrame.pg_copy_from(query_string, connectable=db_engine, parse_dates=["as_of_date"], - index_col=MatrixStore.indices) + df = pd.DataFrame.pg_copy_from( + query_string, + connectable=db_engine, + parse_dates=["as_of_date"], + index_col=MatrixStore.indices, + ) return df @@ -116,7 +126,9 @@ def generate_binary_at_x(test_predictions, x_value, unit="top_n"): else: cutoff_index = int(x_value) num_ones = cutoff_index if cutoff_index <= len_predictions else len_predictions - num_zeroes = len_predictions - cutoff_index if cutoff_index <= len_predictions else 0 + num_zeroes = ( + len_predictions - cutoff_index if cutoff_index <= len_predictions else 0 + ) test_predictions_binary = np.concatenate( (np.ones(num_ones, np.int8), np.zeros(num_zeroes, np.int8)) ) @@ -125,6 +137,7 @@ def generate_binary_at_x(test_predictions, x_value, unit="top_n"): class MetricDefinition(typing.NamedTuple): """A single metric, bound to a particular threshold 
and parameter combination""" + metric: str threshold_unit: str threshold_value: int @@ -137,6 +150,7 @@ class MetricEvaluationResult(typing.NamedTuple): The 'value' could represent the worst, best, or a random version of tiebreaking. """ + metric: str parameter: str value: float @@ -224,8 +238,7 @@ def _validate_metrics(self, custom_metrics): for name, met in custom_metrics.items(): if not hasattr(met, "greater_is_better"): raise ValueError( - f"Custom metric {name} missing greater_is_better " - f"attribute" + f"Custom metric {name} missing greater_is_better " f"attribute" ) elif met.greater_is_better not in (True, False): raise ValueError( @@ -317,7 +330,7 @@ def _flatten_metric_threshold( parameter_string=parameter_string, parameter_combination=parameter_combination, threshold_unit=threshold_unit, - threshold_value=threshold_value + threshold_value=threshold_value, ) metric_definitions.append(result) return metric_definitions @@ -390,7 +403,7 @@ def metric_definitions_from_matrix_type(self, matrix_type): else: return self._flatten_metric_config_groups(self.training_metric_groups) - def needs_evaluations(self, matrix_store, model_id, subset_hash=''): + def needs_evaluations(self, matrix_store, model_id, subset_hash=""): """Returns whether or not all the configured metrics are present in the database for the given matrix and model. Args: @@ -412,24 +425,31 @@ def needs_evaluations(self, matrix_store, model_id, subset_hash=''): # by querying the unique metrics and parameters relevant to the passed-in matrix session = self.sessionmaker() - evaluation_objects_in_db = session.query(eval_obj).filter_by( - model_id=model_id, - evaluation_start_time=matrix_store.as_of_dates[0], - evaluation_end_time=matrix_store.as_of_dates[-1], - as_of_date_frequency=matrix_store.metadata["as_of_date_frequency"], - subset_hash=subset_hash, - ).distinct(eval_obj.metric, eval_obj.parameter).all() + evaluation_objects_in_db = ( + session.query(eval_obj) + .filter_by( + model_id=model_id, + evaluation_start_time=matrix_store.as_of_dates[0], + evaluation_end_time=matrix_store.as_of_dates[-1], + as_of_date_frequency=matrix_store.metadata["as_of_date_frequency"], + subset_hash=subset_hash, + ) + .distinct(eval_obj.metric, eval_obj.parameter) + .all() + ) # The list of needed metrics and parameters are all the unique metric/params from the config # not present in the unique metric/params from the db evals_needed = bool( - {(met.metric, met.parameter_string) for met in metric_definitions} - - {(obj.metric, obj.parameter) for obj in evaluation_objects_in_db} + {(met.metric, met.parameter_string) for met in metric_definitions} + - {(obj.metric, obj.parameter) for obj in evaluation_objects_in_db} ) session.close() if evals_needed: - logger.notice(f"Needed evaluations for model {model_id} on matrix {matrix_store.uuid} are missing") + logger.notice( + f"Needed evaluations for model {model_id} on matrix {matrix_store.uuid} are missing" + ) return True # now check bias config if there @@ -453,8 +473,12 @@ def _compute_evaluations(self, predictions_proba, labels, metric_definitions): Returns: (list of MetricEvaluationResult objects) One result for each metric definition """ evals = [] - for (threshold_unit, threshold_value), metrics_for_threshold, in \ - itertools.groupby(metric_definitions, lambda m: (m.threshold_unit, m.threshold_value)): + for ( + (threshold_unit, threshold_value), + metrics_for_threshold, + ) in itertools.groupby( + metric_definitions, lambda m: (m.threshold_unit, m.threshold_value) + ): predicted_classes = 
generate_binary_at_x( predictions_proba, threshold_value, unit=threshold_unit ) @@ -463,7 +487,9 @@ def _compute_evaluations(self, predictions_proba, labels, metric_definitions): predicted_classes, labels ) num_labeled_examples = len(present_labels) - num_labeled_above_threshold = np.count_nonzero(predicted_classes_with_labels) + num_labeled_above_threshold = np.count_nonzero( + predicted_classes_with_labels + ) num_positive_labels = np.count_nonzero(present_labels) for metric_def in metrics_for_threshold: # using threshold configuration, convert probabilities to predicted classes @@ -500,7 +526,9 @@ def _compute_evaluations(self, predictions_proba, labels, metric_definitions): evals.append(result) return evals - def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, subset=None): + def evaluate( + self, predictions_proba, matrix_store, model_id, protected_df=None, subset=None + ): """Evaluate a model based on predictions, and save the results Args: @@ -515,26 +543,32 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, # If we are evaluating on a subset, we want to get just the labels and # predictions for the included entity-date pairs if subset: - logger.verbose(f"Subsetting labels and predictions of model {model_id} on matrix {matrix_store.uuid}") + logger.verbose( + f"Subsetting labels and predictions of model {model_id} on matrix {matrix_store.uuid}" + ) labels, predictions_proba, protected_df = subset_labels_and_predictions( - subset_df=query_subset_table( - self.db_engine, - matrix_store.as_of_dates, - get_subset_table_name(subset), - ), - predictions_proba=predictions_proba, - labels=matrix_store.labels, - protected_df=protected_df + subset_df=query_subset_table( + self.db_engine, + matrix_store.as_of_dates, + get_subset_table_name(subset), + ), + predictions_proba=predictions_proba, + labels=matrix_store.labels, + protected_df=protected_df, ) subset_hash = filename_friendly_hash(subset) else: - logger.debug(f"Using all the predictions of model {model_id} on matrix {matrix_store.uuid} for evaluation (i.e. no subset)") + logger.debug( + f"Using all the predictions of model {model_id} on matrix {matrix_store.uuid} for evaluation (i.e. no subset)" + ) labels = matrix_store.labels subset_hash = "" # confirm protected_df and labels have same set and count of values if (protected_df is not None) and (not protected_df.empty): - if (protected_df.index.shape != labels.index.shape) or (not protected_df.index.symmetric_difference(labels.index).empty): + if (protected_df.index.shape != labels.index.shape) or ( + not protected_df.index.symmetric_difference(labels.index).empty + ): raise ValueError("Mismatch between protected_df and labels indices") df_index = labels.index @@ -546,32 +580,46 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, logger.spam(f"Found {len(metric_defs)} metric definitions total") # 1. 
get worst sorting - predictions_proba_worst, labels_worst, df_index_worst = sort_predictions_and_labels( + ( + predictions_proba_worst, + labels_worst, + df_index_worst, + ) = sort_predictions_and_labels( predictions_proba=predictions_proba, labels=labels, df_index=df_index, - tiebreaker='worst', + tiebreaker="worst", ) worst_lookup = { (eval.metric, eval.parameter): eval - for eval in - self._compute_evaluations(predictions_proba_worst, labels_worst, metric_defs) + for eval in self._compute_evaluations( + predictions_proba_worst, labels_worst, metric_defs + ) } - logger.debug(f'Predictions from {model_id} sorted by worst case scenario, i.e. all negative and NULL labels first') + logger.debug( + f"Predictions from {model_id} sorted by worst case scenario, i.e. all negative and NULL labels first" + ) # 2. get best sorting - predictions_proba_best, labels_best, df_index_best = sort_predictions_and_labels( + ( + predictions_proba_best, + labels_best, + df_index_best, + ) = sort_predictions_and_labels( predictions_proba=predictions_proba_worst, labels=labels_worst, df_index=df_index_worst, - tiebreaker='best', + tiebreaker="best", ) best_lookup = { (eval.metric, eval.parameter): eval - for eval in - self._compute_evaluations(predictions_proba_best, labels_best, metric_defs) + for eval in self._compute_evaluations( + predictions_proba_best, labels_best, metric_defs + ) } - logger.debug(f'Predictions from {model_id} sorted by best case scenario, i.e. all positive labels first, NULL labels at the end') + logger.debug( + f"Predictions from {model_id} sorted by best case scenario, i.e. all positive labels first, NULL labels at the end" + ) evals_without_trials = dict() @@ -581,8 +629,16 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, for metric_def in metric_defs: worst_eval = worst_lookup[(metric_def.metric, metric_def.parameter_string)] best_eval = best_lookup[(metric_def.metric, metric_def.parameter_string)] - if worst_eval.value is None or best_eval.value is None or math.isclose(worst_eval.value, best_eval.value, rel_tol=RELATIVE_TOLERANCE): - evals_without_trials[(worst_eval.metric, worst_eval.parameter)] = worst_eval.value + if ( + worst_eval.value is None + or best_eval.value is None + or math.isclose( + worst_eval.value, best_eval.value, rel_tol=RELATIVE_TOLERANCE + ) + ): + evals_without_trials[ + (worst_eval.metric, worst_eval.parameter) + ] = worst_eval.value else: metric_defs_to_trial.append(metric_def) @@ -594,19 +650,23 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, random_eval_accumulator = defaultdict(list) for _ in range(0, SORT_TRIALS): sort_seed = generate_python_random_seed() - predictions_proba_random, labels_random, df_index_random = sort_predictions_and_labels( + ( + predictions_proba_random, + labels_random, + df_index_random, + ) = sort_predictions_and_labels( predictions_proba=predictions_proba_worst, labels=labels_worst, df_index=df_index_worst, - tiebreaker='random', - sort_seed=sort_seed + tiebreaker="random", + sort_seed=sort_seed, ) for random_eval in self._compute_evaluations( - predictions_proba_random, - labels_random, - metric_defs_to_trial + predictions_proba_random, labels_random, metric_defs_to_trial ): - random_eval_accumulator[(random_eval.metric, random_eval.parameter)].append(random_eval.value) + random_eval_accumulator[ + (random_eval.metric, random_eval.parameter) + ].append(random_eval.value) # 5. 
flatten best, worst, stochastic results for each metric definition # into database records @@ -622,16 +682,30 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, standard_deviation = 0 num_sort_trials = 0 else: - trial_results = [value for value in random_eval_accumulator[metric_key] if value is not None] + trial_results = [ + value + for value in random_eval_accumulator[metric_key] + if value is not None + ] stochastic_value = statistics.mean(trial_results) - standard_deviation = statistics.stdev(trial_results) + try: + standard_deviation = statistics.stdev(trial_results) + except ValueError: + logger.warning( + f"{metric_def.metric} not defined for parameter {metric_def.parameter_combination} because all values " + "are NaN. Inserting NULL for value." + ) + standard_deviation = None + num_sort_trials = len(trial_results) evaluation = matrix_type.evaluation_obj( metric=metric_def.metric, parameter=metric_def.parameter_string, num_labeled_examples=worst_lookup[metric_key].num_labeled_examples, - num_labeled_above_threshold=worst_lookup[metric_key].num_labeled_above_threshold, + num_labeled_above_threshold=worst_lookup[ + metric_key + ].num_labeled_above_threshold, num_positive_labels=worst_lookup[metric_key].num_positive_labels, worst_value=worst_lookup[metric_key].value, best_value=best_lookup[metric_key].value, @@ -657,23 +731,25 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None, protected_df=protected_df.reindex(df_index_worst), predictions_proba=predictions_proba_worst, labels=labels_worst, - tie_breaker='worst', + tie_breaker="worst", subset_hash=subset_hash, matrix_type=matrix_type, evaluation_start_time=evaluation_start_time, evaluation_end_time=evaluation_end_time, - matrix_uuid=matrix_store.uuid) + matrix_uuid=matrix_store.uuid, + ) self._write_audit_to_db( model_id=model_id, protected_df=protected_df.reindex(df_index_best), predictions_proba=predictions_proba_best, labels=labels_best, - tie_breaker='best', + tie_breaker="best", subset_hash=subset_hash, matrix_type=matrix_type, evaluation_start_time=evaluation_start_time, evaluation_end_time=evaluation_end_time, - matrix_uuid=matrix_store.uuid) + matrix_uuid=matrix_store.uuid, + ) def _write_audit_to_db( self, @@ -686,7 +762,7 @@ def _write_audit_to_db( matrix_type, evaluation_start_time, evaluation_end_time, - matrix_uuid + matrix_uuid, ): """ Runs the bias audit and saves the result in the bias table. 
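The try/except added around `statistics.stdev` in the evaluation.py hunk above relies on `statistics.StatisticsError` being a subclass of `ValueError`: when every randomized sort trial produces a null value, the filtered `trial_results` list has fewer than two entries and `stdev` raises. A small standalone illustration, assuming nothing beyond the standard library:

```python
import statistics

# What happens inside the new guard when all sort trials returned null values:
trial_results = [value for value in [None, None, None] if value is not None]

try:
    standard_deviation = statistics.stdev(trial_results)
except ValueError as exc:
    # statistics.StatisticsError subclasses ValueError, so this clause catches
    # the "requires at least two data points" error and NULL is stored instead.
    standard_deviation = None
    print(f"stdev undefined: {exc}")
```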
@@ -716,58 +792,69 @@ def _write_audit_to_db( # score, label value, model_id, protected attributes # fill out the protected_df, which just has protected attributes at this point protected_df = protected_df.copy() - protected_df['model_id'] = model_id - protected_df['score'] = predictions_proba - protected_df['label_value'] = labels + protected_df["model_id"] = model_id + protected_df["score"] = predictions_proba + protected_df["label_value"] = labels aequitas_df, attr_cols_input = preprocess_input_df(protected_df) # create group crosstabs g = Group() score_thresholds = {} - score_thresholds['rank_abs'] = self.bias_config['thresholds'].get('top_n', []) + score_thresholds["rank_abs"] = self.bias_config["thresholds"].get("top_n", []) # convert 0-100 percentile to 0-1 that Aequitas expects - score_thresholds['rank_pct'] = [value / 100.0 for value in self.bias_config['thresholds'].get('percentiles', [])] - groups_model, attr_cols = g.get_crosstabs(aequitas_df, - score_thresholds=score_thresholds, - attr_cols=attr_cols_input) + score_thresholds["rank_pct"] = [ + value / 100.0 + for value in self.bias_config["thresholds"].get("percentiles", []) + ] + groups_model, attr_cols = g.get_crosstabs( + aequitas_df, score_thresholds=score_thresholds, attr_cols=attr_cols_input + ) # analyze bias from reference groups bias = Bias() - ref_groups_method = self.bias_config.get('ref_groups_method', None) - if ref_groups_method == 'predefined' and self.bias_config['ref_groups']: - bias_df = bias.get_disparity_predefined_groups(groups_model, aequitas_df, self.bias_config['ref_groups']) - elif ref_groups_method == 'majority': + ref_groups_method = self.bias_config.get("ref_groups_method", None) + if ref_groups_method == "predefined" and self.bias_config["ref_groups"]: + bias_df = bias.get_disparity_predefined_groups( + groups_model, aequitas_df, self.bias_config["ref_groups"] + ) + elif ref_groups_method == "majority": bias_df = bias.get_disparity_major_group(groups_model, aequitas_df) else: bias_df = bias.get_disparity_min_metric(groups_model, aequitas_df) # analyze fairness for each group - f = Fairness(tau=0.8) # the default fairness threshold is 0.8 + f = Fairness(tau=0.8) # the default fairness threshold is 0.8 group_value_df = f.get_group_value_fairness(bias_df) - group_value_df['subset_hash'] = subset_hash - group_value_df['tie_breaker'] = tie_breaker - group_value_df['evaluation_start_time'] = evaluation_start_time - group_value_df['evaluation_end_time'] = evaluation_end_time - group_value_df['matrix_uuid'] = matrix_uuid - group_value_df = group_value_df.rename(index=str, columns={"score_threshold": "parameter", "for": "for_"}) + group_value_df["subset_hash"] = subset_hash + group_value_df["tie_breaker"] = tie_breaker + group_value_df["evaluation_start_time"] = evaluation_start_time + group_value_df["evaluation_end_time"] = evaluation_end_time + group_value_df["matrix_uuid"] = matrix_uuid + group_value_df = group_value_df.rename( + index=str, columns={"score_threshold": "parameter", "for": "for_"} + ) if group_value_df.empty: - raise ValueError(f""" + raise ValueError( + f""" Bias audit: aequitas_audit() failed. 
Returned empty dataframe for model_id = {model_id}, and subset_hash = {subset_hash} - and matrix_type = {matrix_type}""") + and matrix_type = {matrix_type}""" + ) with scoped_session(self.db_engine) as session: for index, row in group_value_df.iterrows(): session.query(matrix_type.aequitas_obj).filter_by( - model_id=row['model_id'], - evaluation_start_time=row['evaluation_start_time'], - evaluation_end_time=row['evaluation_end_time'], - subset_hash=row['subset_hash'], - parameter=row['parameter'], - tie_breaker=row['tie_breaker'], - matrix_uuid=row['matrix_uuid'], - attribute_name=row['attribute_name'], - attribute_value=row['attribute_value'] + model_id=row["model_id"], + evaluation_start_time=row["evaluation_start_time"], + evaluation_end_time=row["evaluation_end_time"], + subset_hash=row["subset_hash"], + parameter=row["parameter"], + tie_breaker=row["tie_breaker"], + matrix_uuid=row["matrix_uuid"], + attribute_name=row["attribute_name"], + attribute_value=row["attribute_value"], ).delete() - session.bulk_insert_mappings(matrix_type.aequitas_obj, group_value_df.to_dict(orient="records")) + session.bulk_insert_mappings( + matrix_type.aequitas_obj, group_value_df.to_dict(orient="records") + ) @db_retry def _write_to_db( @@ -805,7 +892,7 @@ def _write_to_db( evaluation_start_time=evaluation_start_time, evaluation_end_time=evaluation_end_time, as_of_date_frequency=as_of_date_frequency, - subset_hash=subset_hash + subset_hash=subset_hash, ).delete() for evaluation in evaluations: diff --git a/src/triage/component/postmodeling/crosstabs.py b/src/triage/component/postmodeling/crosstabs.py index 6ab79bf85..5cc67d742 100644 --- a/src/triage/component/postmodeling/crosstabs.py +++ b/src/triage/component/postmodeling/crosstabs.py @@ -1,5 +1,7 @@ +import ohio.ext.pandas import pandas as pd import verboselogs, logging + logger = verboselogs.VerboseLogger(__name__) from scipy import stats @@ -42,9 +44,11 @@ def __init__(self, config_path=None, config=None): def hr_lr_ttest(hr, lr): - """ Returns the t-test (T statistic and p value), comparing the features for - high- and low-risk entities. """ - res = stats.ttest_ind(hr.to_numpy(), lr.to_numpy(), axis=0, nan_policy="omit", equal_var=False) + """Returns the t-test (T statistic and p value), comparing the features for + high- and low-risk entities.""" + res = stats.ttest_ind( + hr.to_numpy(), lr.to_numpy(), axis=0, nan_policy="omit", equal_var=False + ) r0 = pd.Series(res[0], index=hr.columns) r1 = pd.Series(res[1], index=hr.columns) @@ -84,43 +88,53 @@ def populate_crosstabs_table( schema="results", table="crosstabs", ): - """ This function populates the results.crosstabs table. - - Args: - entity_id_list (list): A list of entity_ids. Crosstabs will - be calculated using only predictions and - features of these entities. This list is - being passed to get_predictions_query and - get_features_query. - crosstab_functions (dict): A dictionary of function names and - functions. The functions receive the feature - dataframes for entities that were predicted - postive or negative, respectively. Each function - returns a Series or DataFrame with rows now - indexed by the feature names, and columns - corresponding to the statistics that the - function calculated. - thresholds (dict): A dictionary that maps column names to lists of - threshold values. The (name, threshold) pairs will - be applied to the dataframes with predictions to - split low- and high-risk entities. - push_to_db (bool): If True, the crosstab table is being populated. 
- return_df (bool): If True, a dataframe corresponding to the crosstabe - table will be returned. - engine (SQLAlchemy engine): DB connection - schema (str): Name of the schema to store results; defaults to 'results' - table (str): Name of table to store results; defaults to 'crosstabs' - - Returns: A DataFrame, corresponding to results.crosstabs + """This function populates the results.crosstabs table. + + Args: + entity_id_list (list): A list of entity_ids. Crosstabs will + be calculated using only predictions and + features of these entities. This list is + being passed to get_predictions_query and + get_features_query. + crosstab_functions (dict): A dictionary of function names and + functions. The functions receive the feature + dataframes for entities that were predicted + postive or negative, respectively. Each function + returns a Series or DataFrame with rows now + indexed by the feature names, and columns + corresponding to the statistics that the + function calculated. + thresholds (dict): A dictionary that maps column names to lists of + threshold values. The (name, threshold) pairs will + be applied to the dataframes with predictions to + split low- and high-risk entities. + push_to_db (bool): If True, the crosstab table is being populated. + return_df (bool): If True, a dataframe corresponding to the crosstabe + table will be returned. + engine (SQLAlchemy engine): DB connection + schema (str): Name of the schema to store results; defaults to 'results' + table (str): Name of table to store results; defaults to 'crosstabs' + + Returns: A DataFrame, corresponding to results.crosstabs """ - print("\n\n****************\nRUNNING populate_crosstabs_table for model_id ", model_id, "and as_of_date ", as_of_date) + logger.info( + f"RUNNING populate_crosstabs_table for model_id {model_id} and as_of_date {as_of_date}", + ) if len(df) == 0: raise ValueError("No data could be fetched.") dfs = [] # this will be useful later - the non-feature columns we'll grab - model_cols = ["model_id", "as_of_date", "entity_id", "score", "rank_abs", "rank_pct", "label_value"] + model_cols = [ + "model_id", + "as_of_date", + "entity_id", + "score", + "rank_abs", + "rank_pct", + "label_value", + ] # drop the all-null columns null_cols = df.columns[df.isnull().sum() == df.shape[0]] @@ -132,27 +146,31 @@ def populate_crosstabs_table( # if the dataframe isn't dummified, do it if object in df.dtypes.values or pd.Categorical.dtype in df.dtypes.values: - to_dummify = [c for c in df.select_dtypes(include=["category", object]) if c not in model_cols] - print("Dummifying the data") + to_dummify = [ + c + for c in df.select_dtypes(include=["category", object]) + if c not in model_cols + ] + logger.debug("Dummifying the data") df = pd.get_dummies(df, columns=to_dummify, dummy_na=True) feat_cols = [c for c in df.columns if c not in model_cols] - print("Iterating over thresholds to generate results...") + logger.debug("Iterating over thresholds to generate results...") for thres_type, thress in thresholds.items(): for thres in thress: results = pd.DataFrame(index=feat_cols) - print("split dataframe in high risk and lowriks") + logger.debug("split dataframe in high risk and lowriks") # split dataframe into high/low risk df_pred_pos = df.loc[df[thres_type] <= thres, feat_cols] df_pred_neg = df.loc[df[thres_type] > thres, feat_cols] - print("len of hr and lr", len(df_pred_pos), len(df_pred_neg)) + logger.debug(f"len of hr: {len(df_pred_pos)} and lr: {len(df_pred_neg)}") for name, func in crosstab_functions: - print(name) 
+ logger.debug(name) this_result = pd.DataFrame(func(df_pred_pos, df_pred_neg)) if name in ["ttest_T", "ttest_p"]: - print("this_result:", this_result.shape) - print("Results:", results.shape) + logger.debug("this_result:", this_result.shape) + logger.debug("Results:", results.shape) if not type(name) in [list, tuple]: name = [name] # the metric name is coming from the crosstab_functions tuples @@ -173,8 +191,10 @@ def populate_crosstabs_table( df = pd.concat(dfs) if push_to_db: - print("Pushing results to database...") - df.reset_index().set_index(["model_id", "as_of_date", "metric"]).to_sql(schema=schema, name=table, con=engine, if_exists="append") + logger.debug("Pushing results to database...") + df.reset_index().set_index(["model_id", "as_of_date", "metric"]).pg_copy_to( + schema=schema, name=table, con=engine, if_exists="append" + ) if return_df: return df @@ -202,7 +222,9 @@ def run_crosstabs(db_engine, crosstabs_config): predictions_query=crosstabs_config.predictions_query, ) if len(crosstabs_config.entity_id_list) > 0: - crosstabs_query += " where entity_id=ANY('{%s}') " % ", ".join(map(str, crosstabs_config.entity_id_list)) + crosstabs_query += " where entity_id=ANY('{%s}') " % ", ".join( + map(str, crosstabs_config.entity_id_list) + ) crosstabs_query += " order by model_id, as_of_date, rank_abs asc;" df = pd.read_sql(crosstabs_query, db_engine) if len(df) == 0: diff --git a/src/triage/experiments/base.py b/src/triage/experiments/base.py index 4bb7aa169..f48e05410 100644 --- a/src/triage/experiments/base.py +++ b/src/triage/experiments/base.py @@ -151,6 +151,7 @@ def __init__( self._check_config_version(config) self.config = config + if self.config.get("cohort_config") is not None: self.config["cohort_config"] = load_query_if_needed( self.config["cohort_config"] @@ -897,7 +898,7 @@ def train_and_test_models(self): experiment.models_needed = len(model_hashes) record_model_building_started(self.run_id, self.db_engine) self.process_train_test_batches(batches) - logger.success("Training, testing and evaluatiog models completed") + logger.success("Training, testing and evaluating models completed") def validate(self, strict=True): ExperimentValidator(self.db_engine, strict=strict).run(self.config)
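The crosstabs.py hunks above replace the `DataFrame.to_sql` push with ohio's COPY-based `pg_copy_to` accessor, which is why `import ohio.ext.pandas` is added at the top of that module. A minimal sketch of the two accessors as they are used in this patch; the connection URL, schema, table name, and toy dataframe are placeholders rather than anything from the Triage codebase, and a reachable PostgreSQL database is assumed.

```python
# Sketch of the ohio pandas accessors used in this patch. Importing
# ohio.ext.pandas registers pg_copy_to / pg_copy_from on DataFrame.
import ohio.ext.pandas  # noqa: F401
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("postgresql:///triage_results")  # hypothetical database

df = pd.DataFrame(
    {"model_id": [1], "as_of_date": ["2016-01-01"], "metric": ["ttest_T"]}
).set_index(["model_id", "as_of_date", "metric"])

# COPY-based write, mirroring the new crosstabs.py call
df.pg_copy_to(schema="results", name="crosstabs", con=engine, if_exists="append")

# COPY-based read, mirroring the query_subset_table call in evaluation.py
round_trip = pd.DataFrame.pg_copy_from(
    "select * from results.crosstabs", connectable=engine
)
print(round_trip.head())
```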