fix sorting of dataframe for aequitas calculations during evaluations #858

Merged · 4 commits · Aug 26, 2021
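
In short, the change threads the dataframe index through sort_predictions_and_labels and re-aligns protected_df to that sorted order (via .reindex) right before the Aequitas audit, rather than aligning it once to the unsorted labels up front. A rough sketch of the idea in plain pandas, with illustrative names only (this is not Triage's actual code):

import pandas as pd

# Toy entity/date index with scores, labels and a protected attribute.
idx = pd.MultiIndex.from_tuples(
    [(1, "2016-01-01"), (2, "2016-01-01"), (3, "2016-01-01")],
    names=["entity_id", "as_of_date"],
)
scores = pd.Series([0.5, 0.9, 0.5], index=idx, name="score")
labels = pd.Series([0, 1, 1], index=idx, name="label_value")
protected = pd.DataFrame({"group": ["low", "high", "low"]}, index=idx)

# Sort scores descending, breaking ties "worst case" (negative labels first among ties).
ordered = pd.DataFrame({"score": scores, "label_value": labels}).sort_values(
    by=["score", "label_value"], ascending=[False, True]
)

# The fix, in essence: carry ordered.index along with the sorted scores/labels and use it
# to put the protected attributes into the same row order before auditing.
protected_sorted = protected.reindex(ordered.index)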
77 changes: 77 additions & 0 deletions src/tests/catwalk_tests/test_evaluation.py
@@ -16,6 +16,7 @@
 import pandas as pd
 from sqlalchemy.sql.expression import text
 from triage.component.catwalk.utils import filename_friendly_hash, get_subset_table_name
+from triage.component.catwalk.storage import MatrixStore
 from tests.utils import fake_labels, fake_trained_model, MockMatrixStore
 from tests.results_tests.factories import (
     ModelFactory,
@@ -751,6 +752,82 @@ def test_evaluation_with_protected_df(db_engine_with_results_schema):
         assert record['attribute_value'] == 'value1'


+def test_evaluation_sorting_with_protected_df(db_engine_with_results_schema):
+    # Test that if a protected_df is passed (along with bias config, the only real needed piece
+    # being threshold info), the Aequitas audit is computed on correctly aligned/sorted data and
+    # written to the database with the expected per-group counts.
+    model_evaluator = ModelEvaluator(
+        testing_metric_groups=[
+            {
+                "metrics": ["precision@"],
+                "thresholds": {"top_n": [3]},
+            },
+        ],
+        training_metric_groups=[],
+        bias_config={
+            'thresholds': {'top_n': [2]}
+        },
+        db_engine=db_engine_with_results_schema,
+    )
+    testing_labels = np.array([1, 1, 1, 0, 1])
+    testing_prediction_probas = np.array([0.56, 0.55, 0.92, 0.85, 0.24])

+    fake_test_matrix_store = MockMatrixStore(
+        "test", "1234", 5, db_engine_with_results_schema,
+        metadata_overrides={'as_of_times': [TRAIN_END_TIME]},
+        matrix=pd.DataFrame.from_dict(
+            {
+                "entity_id": [1, 2, 3, 4, 5],
+                "as_of_date": [pd.Timestamp(2016, 1, 1)]*5,
+                "feature_one": [3, 4, 3, 4, 3],
+                "feature_two": [5, 6, 5, 6, 5],
+                "label": testing_labels,
+            }
+        ).set_index(MatrixStore.indices),
+        init_labels=pd.DataFrame(
+            {
+                "label_value": testing_labels,
+                "entity_id": [1, 2, 3, 4, 5],
+                "as_of_date": [pd.Timestamp(2016, 1, 1)]*5,
+            }
+        ).set_index(["entity_id", "as_of_date"]).label_value,
+        init_as_of_dates=[TRAIN_END_TIME]
+    )

+    trained_model, model_id = fake_trained_model(
+        db_engine_with_results_schema,
+        train_end_time=TRAIN_END_TIME,
+    )

+    protected_df = pd.DataFrame({
+        # "entity_id": fake_test_matrix_store.design_matrix.index.levels[0].tolist(),
+        # "as_of_date": fake_test_matrix_store.design_matrix.index.levels[1].tolist(),
+        "protectedattribute1": ["low", "low", "low", "high", "high"]
+    }, index=fake_test_matrix_store.design_matrix.index)
+    # expected: "low" has 3 records, all labeled 1; "high" has 2 records, one labeled 1

+    expected = {
+        "low": {"group_size": 3, "group_label_neg": 0, "group_label_pos": 3},
+        "high": {"group_size": 2, "group_label_neg": 1, "group_label_pos": 1}
+    }

+    model_evaluator.evaluate(
+        testing_prediction_probas, fake_test_matrix_store, model_id, protected_df
+    )

+    for record in db_engine_with_results_schema.execute(
+        """select * from test_results.aequitas
+        where model_id = %s and evaluation_start_time = %s
+        order by 1""",
+        (model_id, fake_test_matrix_store.as_of_dates[0]),
+    ):
+        assert record['model_id'] == model_id
+        assert record['parameter'] == '2_abs'
+        assert record['attribute_name'] == 'protectedattribute1'
+        for col, value in expected[record['attribute_value']].items():
+            assert record[col] == value


 def test_generate_binary_at_x():
     input_array = np.array(
         [0.9, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.6])
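
For reference, the bias_config threshold of top_n: [2] is what surfaces as parameter '2_abs' in the aequitas records checked above, and the expected per-group counts follow directly from the fake labels and protected attribute. A quick, stand-alone way to sanity-check those counts with plain pandas (not part of the PR):

import pandas as pd

labels = [1, 1, 1, 0, 1]
groups = ["low", "low", "low", "high", "high"]
df = pd.DataFrame({"label_value": labels, "protectedattribute1": groups})

summary = df.groupby("protectedattribute1")["label_value"].agg(
    group_size="size",
    group_label_pos="sum",
    group_label_neg=lambda s: int((s == 0).sum()),
)
print(summary)
# high -> size 2, pos 1, neg 1; low -> size 3, pos 3, neg 0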
25 changes: 15 additions & 10 deletions src/tests/catwalk_tests/test_utils.py
@@ -114,43 +114,48 @@ def test_missing_matrix_uuids():

 def test_sort_predictions_and_labels():
     predictions = np.array([0.5, 0.4, 0.6, 0.5, 0.6])
-    entities = np.array(range(6))
+    entities = np.array(range(5))
     labels = np.array([0, 0, 1, 1, None])

     # best sort
-    sorted_predictions, sorted_labels = sort_predictions_and_labels(
-        predictions, labels, tiebreaker='best'
+    sorted_predictions, sorted_labels, sorted_entities = sort_predictions_and_labels(
+        predictions, labels, entities, tiebreaker='best'
     )
     assert_array_equal(sorted_predictions, np.array([0.6, 0.6, 0.5, 0.5, 0.4]))
     assert_array_equal(sorted_labels, np.array([1, None, 1, 0, 0]))
+    assert_array_equal(sorted_entities.to_numpy(), np.array([2, 4, 3, 0, 1]))

-    # worst wort
-    sorted_predictions, sorted_labels = sort_predictions_and_labels(
-        predictions, labels, tiebreaker='worst'
+    # worst sort
+    sorted_predictions, sorted_labels, sorted_entities = sort_predictions_and_labels(
+        predictions, labels, entities, tiebreaker='worst'
     )
     assert_array_equal(sorted_predictions, np.array([0.6, 0.6, 0.5, 0.5, 0.4]))
     assert_array_equal(sorted_labels, np.array([None, 1, 0, 1, 0]))
+    assert_array_equal(sorted_entities.to_numpy(), np.array([4, 2, 0, 3, 1]))

     # random tiebreaker needs a seed
     with pytest.raises(ValueError):
-        sort_predictions_and_labels(predictions, labels, tiebreaker='random')
+        sort_predictions_and_labels(predictions, labels, entities, tiebreaker='random')

     # random tiebreaker respects the seed
-    sorted_predictions, sorted_labels = sort_predictions_and_labels(
+    sorted_predictions, sorted_labels, sorted_entities = sort_predictions_and_labels(
         predictions,
         labels,
+        entities,
         tiebreaker='random',
         sort_seed=1234
     )
     assert_array_equal(sorted_predictions, np.array([0.6, 0.6, 0.5, 0.5, 0.4]))
     assert_array_equal(sorted_labels, np.array([None, 1, 1, 0, 0]))
+    assert_array_equal(sorted_entities.to_numpy(), np.array([4, 2, 3, 0, 1]))


-    sorted_predictions, sorted_labels = sort_predictions_and_labels(
+    sorted_predictions, sorted_labels, sorted_entities = sort_predictions_and_labels(
         predictions,
         labels,
+        entities,
         tiebreaker='random',
         sort_seed=24376234
     )
     assert_array_equal(sorted_predictions, np.array([0.6, 0.6, 0.5, 0.5, 0.4]))
     assert_array_equal(sorted_labels, np.array([None, 1, 0, 1, 0]))
+    assert_array_equal(sorted_entities.to_numpy(), np.array([4, 2, 0, 3, 1]))
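
For reference, the expected orderings above follow from sorting scores descending and breaking ties on the label: the tied pairs are entities (2, 4) at 0.6 and (0, 3) at 0.5, with 'best' resolving each tie in the model's favour and 'worst' against it. A small, dependency-free way to reproduce the 'best' ordering (how the NULL label ranks relative to label 0 is an assumption here; the test data never makes them compete):

scores = [0.5, 0.4, 0.6, 0.5, 0.6]
labels = [0, 0, 1, 1, None]

def best_rank(label):
    # 'best' tiebreak: assume ties resolve in the model's favour, so label 1 first,
    # then the NULL label, then label 0 (the NULL-vs-0 order is an assumption).
    return {1: 0, None: 1, 0: 2}[label]

order = sorted(range(len(scores)), key=lambda i: (-scores[i], best_rank(labels[i])))
print(order)  # [2, 4, 3, 0, 1], matching sorted_entities for tiebreaker='best'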
21 changes: 14 additions & 7 deletions src/triage/component/catwalk/evaluation.py
@@ -512,8 +512,6 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
                 name for the subset to evaluate on, if any
             protected_df (pandas.DataFrame) A dataframe with protected group attributes
         """
-        if (protected_df is not None) and (not protected_df.empty):
-            protected_df = protected_df.align(matrix_store.labels, join="inner", axis=0)[0]
         # If we are evaluating on a subset, we want to get just the labels and
         # predictions for the included entity-date pairs
         if subset:
@@ -534,6 +532,12 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
             labels = matrix_store.labels
             subset_hash = ""

+        # confirm protected_df and labels have same set and count of values
+        if (protected_df is not None) and (not protected_df.empty):
+            if (protected_df.index.shape != labels.index.shape) or (not protected_df.index.symmetric_difference(labels.index).empty):
+                raise ValueError("Mismatch between protected_df and labels indices")
+
+        df_index = labels.index
         labels = np.array(labels)

         matrix_type = matrix_store.matrix_type
@@ -542,9 +546,10 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
         logger.spam(f"Found {len(metric_defs)} metric definitions total")

         # 1. get worst sorting
-        predictions_proba_worst, labels_worst = sort_predictions_and_labels(
+        predictions_proba_worst, labels_worst, df_index_worst = sort_predictions_and_labels(
             predictions_proba=predictions_proba,
             labels=labels,
+            df_index=df_index,
             tiebreaker='worst',
         )
         worst_lookup = {
@@ -555,9 +560,10 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
         logger.debug(f'Predictions from {model_id} sorted by worst case scenario, i.e. all negative and NULL labels first')

         # 2. get best sorting
-        predictions_proba_best, labels_best = sort_predictions_and_labels(
+        predictions_proba_best, labels_best, df_index_best = sort_predictions_and_labels(
             predictions_proba=predictions_proba_worst,
             labels=labels_worst,
+            df_index=df_index_worst,
             tiebreaker='best',
         )
         best_lookup = {
@@ -588,9 +594,10 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
             random_eval_accumulator = defaultdict(list)
             for _ in range(0, SORT_TRIALS):
                 sort_seed = generate_python_random_seed()
-                predictions_proba_random, labels_random = sort_predictions_and_labels(
+                predictions_proba_random, labels_random, df_index_random = sort_predictions_and_labels(
                     predictions_proba=predictions_proba_worst,
                     labels=labels_worst,
+                    df_index=df_index_worst,
                     tiebreaker='random',
                     sort_seed=sort_seed
                 )
@@ -647,7 +654,7 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
         if protected_df is not None:
             self._write_audit_to_db(
                 model_id=model_id,
-                protected_df=protected_df,
+                protected_df=protected_df.reindex(df_index_worst),
                 predictions_proba=predictions_proba_worst,
                 labels=labels_worst,
                 tie_breaker='worst',
@@ -658,7 +665,7 @@ def evaluate(self, predictions_proba, matrix_store, model_id, protected_df=None,
                 matrix_uuid=matrix_store.uuid)
             self._write_audit_to_db(
                 model_id=model_id,
-                protected_df=protected_df,
+                protected_df=protected_df.reindex(df_index_best),
                 predictions_proba=predictions_proba_best,
                 labels=labels_best,
                 tie_breaker='best',
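
The new guard in evaluate() treats protected_df and the labels as matching when their indices have the same shape and an empty symmetric difference, i.e. the same entity/date pairs in any order. A stand-alone illustration of that check (toy index, not Triage code):

import pandas as pd

labels_index = pd.MultiIndex.from_tuples(
    [(1, "2016-01-01"), (2, "2016-01-01")], names=["entity_id", "as_of_date"]
)
protected_index = pd.MultiIndex.from_tuples(
    [(2, "2016-01-01"), (1, "2016-01-01")], names=["entity_id", "as_of_date"]
)

# Same members, different order: the check passes...
assert protected_index.shape == labels_index.shape
assert protected_index.symmetric_difference(labels_index).empty

# ...which is exactly why the rows are re-ordered with .reindex(df_index_worst / df_index_best)
# before being handed to the audit, rather than used as-is.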
12 changes: 7 additions & 5 deletions src/triage/component/catwalk/utils.py
@@ -156,25 +156,26 @@ def __iter__(self):

 AVAILABLE_TIEBREAKERS = {'random', 'best', 'worst'}

-def sort_predictions_and_labels(predictions_proba, labels, tiebreaker='random', sort_seed=None):
+def sort_predictions_and_labels(predictions_proba, labels, df_index, tiebreaker='random', sort_seed=None):
     """Sort predictions and labels with a configured tiebreaking rule

     Args:
         predictions_proba (np.array) The predicted scores
         labels (np.array) The numeric labels (1/0, not True/False)
+        df_index (pd.MultiIndex) Index (generally entity_id, as_of_date tuples) to be sorted with the labels/scores
         tiebreaker (string) The tiebreaking method ('best', 'worst', 'random')
         sort_seed (signed int) The sort seed. Needed if 'random' tiebreaking is picked.

     Returns:
-        (tuple) (predictions_proba, labels), sorted
+        (tuple) (predictions_proba, labels, df_index), sorted
     """
     if len(labels) == 0:
         logger.notice("No labels present, skipping predictions sorting .")
-        return (predictions_proba, labels)
-    mask = None
+        return (predictions_proba, labels, df_index)

     df = pd.DataFrame(predictions_proba, columns=["score"])
     df['label_value'] = labels
+    df.set_index(df_index, inplace=True)


     if tiebreaker == 'random':
@@ -194,7 +195,8 @@ def sort_predictions_and_labels(predictions_proba, labels, tiebreaker='random',

     return [
         df['score'].to_numpy(),
-        df['label_value'].to_numpy()
+        df['label_value'].to_numpy(),
+        df.index
     ]


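With the new signature, callers unpack three return values and can use the returned index to re-order any companion dataframe. Roughly like this, assuming sort_predictions_and_labels is importable from triage.component.catwalk.utils (the module changed above):

import numpy as np
import pandas as pd
from triage.component.catwalk.utils import sort_predictions_and_labels  # import path assumed from the diff above

predictions = np.array([0.5, 0.4, 0.6])
labels = np.array([0, 1, 1])
index = pd.MultiIndex.from_tuples(
    [(1, "2016-01-01"), (2, "2016-01-01"), (3, "2016-01-01")],
    names=["entity_id", "as_of_date"],
)

# Three values come back: sorted scores, sorted labels, and the index in the same order.
scores_sorted, labels_sorted, index_sorted = sort_predictions_and_labels(
    predictions, labels, index, tiebreaker="worst"
)

# index_sorted is what lets evaluate() re-align a protected-attributes frame:
# protected_df.reindex(index_sorted)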