fix sorting of dataframe for aequitas calculations during evaluations #858

Merged · 4 commits · Aug 26, 2021
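
In short, the change threads the dataframe index through sort_predictions_and_labels and re-aligns protected_df to that sorted order (via .reindex) right before the Aequitas audit, rather than aligning it once to the unsorted labels up front. A rough sketch of the idea in plain pandas, with illustrative names only (this is not Triage's actual code):

import pandas as pd

# Toy entity/date index with scores, labels and a protected attribute.
idx = pd.MultiIndex.from_tuples(
    [(1, "2016-01-01"), (2, "2016-01-01"), (3, "2016-01-01")],
    names=["entity_id", "as_of_date"],
)
scores = pd.Series([0.5, 0.9, 0.5], index=idx, name="score")
labels = pd.Series([0, 1, 1], index=idx, name="label_value")
protected = pd.DataFrame({"group": ["low", "high", "low"]}, index=idx)

# Sort scores descending, breaking ties "worst case" (negative labels first among ties).
ordered = pd.DataFrame({"score": scores, "label_value": labels}).sort_values(
    by=["score", "label_value"], ascending=[False, True]
)

# The fix, in essence: carry ordered.index along with the sorted scores/labels and use it
# to put the protected attributes into the same row order before auditing.
protected_sorted = protected.reindex(ordered.index)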
77 changes: 77 additions & 0 deletions src/tests/catwalk_tests/test_evaluation.py
@@ -16,6 +16,7 @@
 import pandas as pd
 from sqlalchemy.sql.expression import text
 from triage.component.catwalk.utils import filename_friendly_hash, get_subset_table_name
+from triage.component.catwalk.storage import MatrixStore
 from tests.utils import fake_labels, fake_trained_model, MockMatrixStore
 from tests.results_tests.factories import (
     ModelFactory,
@@ -751,6 +752,82 @@ def test_evaluation_with_protected_df(db_engine_with_results_schema):
         assert record['attribute_value'] == 'value1'


+def test_evaluation_sorting_with_protected_df(db_engine_with_results_schema):
+    # Test that if a protected_df is passed (along with bias config, the only real needed piece
+    # being threshold info), the Aequitas audit is computed on correctly aligned/sorted data and
+    # written to the database with the expected per-group counts.
+    model_evaluator = ModelEvaluator(
+        testing_metric_groups=[
+            {
+                "metrics": ["precision@"],
+                "thresholds": {"top_n": [3]},
+            },
+        ],
+        training_metric_groups=[],
+        bias_config={
+            'thresholds': {'top_n': [2]}
+        },
+        db_engine=db_engine_with_results_schema,
+    )
+    testing_labels = np.array([1, 1, 1, 0, 1])
+    testing_prediction_probas = np.array([0.56, 0.55, 0.92, 0.85, 0.24])

+    fake_test_matrix_store = MockMatrixStore(
+        "test", "1234", 5, db_engine_with_results_schema,
+        metadata_overrides={'as_of_times': [TRAIN_END_TIME]},
+        matrix=pd.DataFrame.from_dict(
+            {
+                "entity_id": [1, 2, 3, 4, 5],
+                "as_of_date": [pd.Timestamp(2016, 1, 1)]*5,
+                "feature_one": [3, 4, 3, 4, 3],
+                "feature_two": [5, 6, 5, 6, 5],
+                "label": testing_labels,
+            }
+        ).set_index(MatrixStore.indices),
+        init_labels=pd.DataFrame(
+            {
+                "label_value": testing_labels,
+                "entity_id": [1, 2, 3, 4, 5],
+                "as_of_date": [pd.Timestamp(2016, 1, 1)]*5,
+            }
+        ).set_index(["entity_id", "as_of_date"]).label_value,
+        init_as_of_dates=[TRAIN_END_TIME]
+    )

+    trained_model, model_id = fake_trained_model(
+        db_engine_with_results_schema,
+        train_end_time=TRAIN_END_TIME,
+    )

+    protected_df = pd.DataFrame({
+        # "entity_id": fake_test_matrix_store.design_matrix.index.levels[0].tolist(),
+        # "as_of_date": fake_test_matrix_store.design_matrix.index.levels[1].tolist(),
+        "protectedattribute1": ["low", "low", "low", "high", "high"]
+    }, index=fake_test_matrix_store.design_matrix.index)
+    # expected: "low" has 3 records, all labeled 1; "high" has 2 records, one labeled 1

+    expected = {
+        "low": {"group_size": 3, "group_label_neg": 0, "group_label_pos": 3},
+        "high": {"group_size": 2, "group_label_neg": 1, "group_label_pos": 1}
+    }

+    model_evaluator.evaluate(
+        testing_prediction_probas, fake_test_matrix_store, model_id, protected_df
+    )

+    for record in db_engine_with_results_schema.execute(
+        """select * from test_results.aequitas
+        where model_id = %s and evaluation_start_time = %s
+        order by 1""",
+        (model_id, fake_test_matrix_store.as_of_dates[0]),
+    ):
+        assert record['model_id'] == model_id
+        assert record['parameter'] == '2_abs'
+        assert record['attribute_name'] == 'protectedattribute1'
+        for col, value in expected[record['attribute_value']].items():
+            assert record[col] == value


 def test_generate_binary_at_x():
     input_array = np.array(
         [0.9, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.6])
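
For reference, the bias_config threshold of top_n: [2] is what surfaces as parameter '2_abs' in the aequitas records checked above, and the expected per-group counts follow directly from the fake labels and protected attribute. A quick, stand-alone way to sanity-check those counts with plain pandas (not part of the PR):

import pandas as pd

labels = [1, 1, 1, 0, 1]
groups = ["low", "low", "low", "high", "high"]
df = pd.DataFrame({"label_value": labels, "protectedattribute1": groups})

summary = df.groupby("protectedattribute1")["label_value"].agg(
    group_size="size",
    group_label_pos="sum",
    group_label_neg=lambda s: int((s == 0).sum()),
)
print(summary)
# high -> size 2, pos 1, neg 1; low -> size 3, pos 3, neg 0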
25 changes: 15 additions & 10 deletions src/tests/catwalk_tests/test_utils.py
@@ -114,43 +114,48 @@ def test_missing_matrix_uuids():

 def test_sort_predictions_and_labels():
     predictions = np.array([0.5, 0.4, 0.6, 0.5, 0.6])
-    entities = np.array(range(6))
+    entities = np.array(range(5))
     labels = np.array([0, 0, 1, 1, None])

     # best sort
-    sorted_predictions, sorted_labels = sort_predictions_and_labels(
-        predictions, labels, tiebreaker='best'
+    sorted_predictions, sorted_labels, sorted_entities = sort_predictions_and_labels(
+        predictions, labels, entities, tiebreaker='best'
     )
     assert_array_equal(sorted_predictions, np.array([0.6, 0.6, 0.5, 0.5, 0.4]))
     assert_array_equal(sorted_labels, np.array([1, None, 1, 0, 0]))
+    assert_array_equal(sorted_entities.to_numpy(), np.array([2, 4, 3, 0, 1]))

-    # worst wort
-    sorted_predictions, sorted_labels = sort_predictions_and_labels(
-        predictions, labels, tiebreaker='worst'
+    # worst sort
+    sorted_predictions, sorted_labels, sorted_entities = sort_predictions_and_labels(
+        predictions, labels, entities, tiebreaker='worst'
     )
     assert_array_equal(sorted_predictions, np.array([0.6, 0.6, 0.5, 0.5, 0.4]))
     assert_array_equal(sorted_labels, np.array([None, 1, 0, 1, 0]))
+    assert_array_equal(sorted_entities.to_numpy(), np.array([4, 2, 0, 3, 1]))

     # random tiebreaker needs a seed
     with pytest.raises(ValueError):
-        sort_predictions_and_labels(predictions, labels, tiebreaker='random')
+        sort_predictions_and_labels(predictions, labels, entities, tiebreaker='random')

     # random tiebreaker respects the seed
-    sorted_predictions, sorted_labels = sort_predictions_and_labels(
+    sorted_predictions, sorted_labels, sorted_entities = sort_predictions_and_labels(
         predictions,
         labels,
+        entities,
         tiebreaker='random',
         sort_seed=1234
     )
     assert_array_equal(sorted_predictions, np.array([0.6, 0.6, 0.5, 0.5, 0.4]))
     assert_array_equal(sorted_labels, np.array([None, 1, 1, 0, 0]))
+    assert_array_equal(sorted_entities.to_numpy(), np.array([4, 2, 3, 0, 1]))


-    sorted_predictions, sorted_labels = sort_predictions_and_labels(
+    sorted_predictions, sorted_labels, sorted_entities = sort_predictions_and_labels(
         predictions,
         labels,
+        entities,
         tiebreaker='random',
         sort_seed=24376234
     )
     assert_array_equal(sorted_predictions, np.array([0.6, 0.6, 0.5, 0.5, 0.4]))
     assert_array_equal(sorted_labels, np.array([None, 1, 0, 1, 0]))
+    assert_array_equal(sorted_entities.to_numpy(), np.array([4, 2, 0, 3, 1]))
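
For reference, the expected orderings above follow from sorting scores descending and breaking ties on the label: the tied pairs are entities (2, 4) at 0.6 and (0, 3) at 0.5, with 'best' resolving each tie in the model's favour and 'worst' against it. A small, dependency-free way to reproduce the 'best' ordering (how the NULL label ranks relative to label 0 is an assumption here; the test data never makes them compete):

scores = [0.5, 0.4, 0.6, 0.5, 0.6]
labels = [0, 0, 1, 1, None]

def best_rank(label):
    # 'best' tiebreak: assume ties resolve in the model's favour, so label 1 first,
    # then the NULL label, then label 0 (the NULL-vs-0 order is an assumption).
    return {1: 0, None: 1, 0: 2}[label]

order = sorted(range(len(scores)), key=lambda i: (-scores[i], best_rank(labels[i])))
print(order)  # [2, 4, 3, 0, 1], matching sorted_entities for tiebreaker='best'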
21 changes: 14 additions & 7 deletions src/triage/component/catwalk/evaluation.py
@@ -512,8 +512,6 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
                 name for the subset to evaluate on, if any
             protected_df (pandas.DataFrame) A dataframe with protected group attributes
         """
-        if (protected_df is not None) and (not protected_df.empty):
-            protected_df = protected_df.align(matrix_store.labels, join="inner", axis=0)[0]
         # If we are evaluating on a subset, we want to get just the labels and
         # predictions for the included entity-date pairs
         if subset:
@@ -534,6 +532,12 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
             labels = matrix_store.labels
             subset_hash = ""

+        # confirm protected_df and labels have same set and count of values
+        if (protected_df is not None) and (not protected_df.empty):
+            if (protected_df.index.shape != labels.index.shape) or (not protected_df.index.symmetric_difference(labels.index).empty):
+                raise ValueError("Mismatch between protected_df and labels indices")
+
+        df_index = labels.index
         labels = np.array(labels)

         matrix_type = matrix_store.matrix_type
@@ -542,9 +546,10 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
         logger.spam(f"Found {len(metric_defs)} metric definitions total")

         # 1. get worst sorting
-        predictions_proba_worst, labels_worst = sort_predictions_and_labels(
+        predictions_proba_worst, labels_worst, df_index_worst = sort_predictions_and_labels(
             predictions_proba=predictions_proba,
             labels=labels,
+            df_index=df_index,
             tiebreaker='worst',
         )
         worst_lookup = {
@@ -555,9 +560,10 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
         logger.debug(f'Predictions from {model_id} sorted by worst case scenario, i.e. all negative and NULL labels first')

         # 2. get best sorting
-        predictions_proba_best, labels_best = sort_predictions_and_labels(
+        predictions_proba_best, labels_best, df_index_best = sort_predictions_and_labels(
             predictions_proba=predictions_proba_worst,
             labels=labels_worst,
+            df_index=df_index_worst,
             tiebreaker='best',
         )
         best_lookup = {
@@ -588,9 +594,10 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
             random_eval_accumulator = defaultdict(list)
             for _ in range(0, SORT_TRIALS):
                 sort_seed = generate_python_random_seed()
-                predictions_proba_random, labels_random = sort_predictions_and_labels(
+                predictions_proba_random, labels_random, df_index_random = sort_predictions_and_labels(
                     predictions_proba=predictions_proba_worst,
                     labels=labels_worst,
+                    df_index=df_index_worst,
                     tiebreaker='random',
                     sort_seed=sort_seed
                 )
@@ -647,7 +654,7 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
         if protected_df is not None:
             self._write_audit_to_db(
                 model_id=model_id,
-                protected_df=protected_df,
+                protected_df=protected_df.reindex(df_index_worst),
                 predictions_proba=predictions_proba_worst,
                 labels=labels_worst,
                 tie_breaker='worst',
@@ -658,7 +665,7 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
                 matrix_uuid=matrix_store.uuid)
             self._write_audit_to_db(
                 model_id=model_id,
-                protected_df=protected_df,
+                protected_df=protected_df.reindex(df_index_best),
                 predictions_proba=predictions_proba_best,
                 labels=labels_best,
                 tie_breaker='best',
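
The new guard in evaluate() treats protected_df and the labels as matching when their indices have the same shape and an empty symmetric difference, i.e. the same entity/date pairs in any order. A stand-alone illustration of that check (toy index, not Triage code):

import pandas as pd

labels_index = pd.MultiIndex.from_tuples(
    [(1, "2016-01-01"), (2, "2016-01-01")], names=["entity_id", "as_of_date"]
)
protected_index = pd.MultiIndex.from_tuples(
    [(2, "2016-01-01"), (1, "2016-01-01")], names=["entity_id", "as_of_date"]
)

# Same members, different order: the check passes...
assert protected_index.shape == labels_index.shape
assert protected_index.symmetric_difference(labels_index).empty

# ...which is exactly why the rows are re-ordered with .reindex(df_index_worst / df_index_best)
# before being handed to the audit, rather than used as-is.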
12 changes: 7 additions & 5 deletions src/triage/component/catwalk/utils.py
@@ -156,25 +156,26 @@ def __iter__(self):

 AVAILABLE_TIEBREAKERS = {'random', 'best', 'worst'}

-def sort_predictions_and_labels(predictions_proba, labels, tiebreaker='random', sort_seed=None):
+def sort_predictions_and_labels(predictions_proba, labels, df_index, tiebreaker='random', sort_seed=None):
     """Sort predictions and labels with a configured tiebreaking rule

     Args:
         predictions_proba (np.array) The predicted scores
         labels (np.array) The numeric labels (1/0, not True/False)
+        df_index (pd.MultiIndex) Index (generally entity_id, as_of_date tuples) to be sorted with the labels/scores
         tiebreaker (string) The tiebreaking method ('best', 'worst', 'random')
         sort_seed (signed int) The sort seed. Needed if 'random' tiebreaking is picked.

     Returns:
-        (tuple) (predictions_proba, labels), sorted
+        (tuple) (predictions_proba, labels, df_index), sorted
     """
     if len(labels) == 0:
         logger.notice("No labels present, skipping predictions sorting .")
-        return (predictions_proba, labels)
-    mask = None
+        return (predictions_proba, labels, df_index)

     df = pd.DataFrame(predictions_proba, columns=["score"])
     df['label_value'] = labels
+    df.set_index(df_index, inplace=True)


     if tiebreaker == 'random':
@@ -194,7 +195,8 @@ def sort_predictions_and_labels(predictions_proba, labels, tiebreaker='random',

     return [
         df['score'].to_numpy(),
-        df['label_value'].to_numpy()
+        df['label_value'].to_numpy(),
+        df.index
     ]


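With the new signature, callers unpack three return values and can use the returned index to re-order any companion dataframe. Roughly like this, assuming sort_predictions_and_labels is importable from triage.component.catwalk.utils (the module changed above):

import numpy as np
import pandas as pd
from triage.component.catwalk.utils import sort_predictions_and_labels  # import path assumed from the diff above

predictions = np.array([0.5, 0.4, 0.6])
labels = np.array([0, 1, 1])
index = pd.MultiIndex.from_tuples(
    [(1, "2016-01-01"), (2, "2016-01-01"), (3, "2016-01-01")],
    names=["entity_id", "as_of_date"],
)

# Three values come back: sorted scores, sorted labels, and the index in the same order.
scores_sorted, labels_sorted, index_sorted = sort_predictions_and_labels(
    predictions, labels, index, tiebreaker="worst"
)

# index_sorted is what lets evaluate() re-align a protected-attributes frame:
# protected_df.reindex(index_sorted)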