Remove support for non-entity_id grouping for features (#887)

* remove hard-coded reference to entity_id
* remove groups feature key and bump config version to v8
* remove non-entity groups from tests
dssg · Mar 28, 2022 · 24c8019 · 24c8019
1 parent 873c3ca
commit 24c8019
Show file tree

Hide file tree

Showing 33 changed files with 171 additions and 359 deletions.
diff --git a/.python-version.current b/.python-version.current
@@ -1 +1 @@
-triage-3.6.2
+triage-3.9.10
diff --git a/README.md b/README.md
@@ -25,7 +25,7 @@ Triage is designed to:
 
 To install Triage, you need:
 
-- Python 3.6
+- Python 3.7+
 - A PostgreSQL 9.6+ database with your source data (events,
   geographical data, etc) loaded.
   - **NOTE**: If your database is PostgreSQL 11+ you will get some
@@ -35,7 +35,7 @@ To install Triage, you need:
   Services's S3), to store the needed matrices and models for your
   experiments
 
-We recommend starting with a new python virtual environment (with Python 3.6 or greater) and pip installing triage there.
+We recommend starting with a new python virtual environment and pip installing triage there.
 ```bash
 $ virtualenv triage-env
 $ . triage-env/bin/activate
@@ -106,7 +106,7 @@ example:
 
     (pyenv) installed
 
-    (python-3.6.2) installed
+    (python-3.9.10) installed
 
     (virtualenv) installed
 

diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -103,6 +103,7 @@ nav:
           - Testing Feature Configuration: experiments/feature-testing.md
           - Running an Experiment: experiments/running.md
           - Upgrading an Experiment:
+              - v7 -> v8: experiments/upgrade-to-v8.md
               - v6 -> v7: experiments/upgrade-to-v7.md
               - v5 -> v6: experiments/upgrade-to-v6.md
               - v3/v4 -> v5: experiments/upgrade-to-v5.md

diff --git a/docs/sources/dirtyduck/dirty_duckling.md b/docs/sources/dirtyduck/dirty_duckling.md
@@ -133,10 +133,12 @@ If you wish, you can check the content of the file with `cat
 experiments/dirty-ducking.yaml`
 
 ```yaml
-config_version: 'v7'
+config_version: 'v8'
 
 model_comment: 'dirtyduck-quickstart'
 
+random_seed: 1234
+
 temporal_config:
     label_timespans: ['3months']
 
@@ -170,9 +172,6 @@ feature_aggregations:
 
     intervals: ['all']
 
-    groups:
-      - 'entity_id'
-
 model_grid_preset:  'quickstart'
 
 scoring:

diff --git a/docs/sources/dirtyduck/eis.md b/docs/sources/dirtyduck/eis.md
@@ -74,7 +74,7 @@ First the usual stuff. Note that we are changing `model_comment` and
 *hash* that differentiates models and model groups).
 
 ```yaml
-config_version: 'v7'
+config_version: 'v8'
 
 model_comment: 'eis: 01'
 random_seed: 23895478
@@ -223,9 +223,6 @@ in [inspections prioritization](inspections.md):
 
         intervals: ['1month', '3month', '6month', '1y', 'all']
 
-        groups:
-          - 'entity_id'
-
       -
         prefix: 'risks'
         from_obj: 'semantic.events'
@@ -247,10 +244,6 @@ in [inspections prioritization](inspections.md):
 
         intervals: ['1month', '3month', '6month', '1y', 'all']
 
-        groups:
-          - 'entity_id'
-          - 'zip_code'
-
       -
         prefix: 'results'
         from_obj: 'semantic.events'
@@ -270,9 +263,6 @@ in [inspections prioritization](inspections.md):
 
         intervals: ['1month', '3month', '6month', '1y', 'all']
 
-        groups:
-          - 'entity_id'
-
       -
         prefix: 'inspection_types'
         from_obj: 'semantic.events'
@@ -291,9 +281,6 @@ in [inspections prioritization](inspections.md):
 
         intervals: ['1month', '3month', '6month', '1y', 'all']
 
-        groups:
-          - 'entity_id'
-          - 'zip_code'
 ```
 
 We specify that we want to use all possible feature-group combinations for training:
@@ -513,7 +500,7 @@ The only differences between this experiment config file and the
 previous are in the `user_metadata` section:
 
 ```yaml
-config_version: 'v7'
+config_version: 'v8'
 
 model_comment: 'eis: 02'
 random_seed: 23895478
@@ -942,8 +929,8 @@ models_dates_join_query: |
 #features_query must join models_dates_join_query with 1 or more features table using as_of_date
 features_query: |
   select m.model_id, m.as_of_date, f4.entity_id, f4.results_entity_id_1month_result_fail_avg, f4.results_entity_id_3month_result_fail_avg, f4.results_entity_id_6month_result_fail_avg,
-  f2.inspection_types_zip_code_1month_type_canvass_sum, f3.risks_zip_code_1month_risk_high_sum, f4.results_entity_id_6month_result_pass_avg,
-  f3.risks_entity_id_all_risk_high_sum, f2.inspection_types_zip_code_3month_type_canvass_sum, f4.results_entity_id_6month_result_pass_sum,
+  f2.inspection_types_entity_id_1month_type_canvass_sum, f3.risks_entity_id_1month_risk_high_sum, f4.results_entity_id_6month_result_pass_avg,
+  f3.risks_entity_id_all_risk_high_sum, f2.inspection_types_entity_id_3month_type_canvass_sum, f4.results_entity_id_6month_result_pass_sum,
   f2.inspection_types_entity_id_all_type_canvass_sum
   from features.inspection_types_aggregation_imputed as f2
   inner join features.risks_aggregation_imputed as f3 using (entity_id, as_of_date)

diff --git a/docs/sources/dirtyduck/inspections.md b/docs/sources/dirtyduck/inspections.md
@@ -181,7 +181,7 @@ The config file for this first experiment is located in
 [inspections_baseline.yaml](https://github.com/dssg/triage/blob/master/example/dirtyduck/experiments/inspections_baseline.yaml).
 
 The first lines of the experiment config file specify the config-file
-version (`v7` at the moment of writing this tutorial), a comment
+version (`v8` at the moment of writing this tutorial), a comment
 (`model_comment`, which will end up as a value in the
 `triage_metadata.models` table), and a list of user-defined metadata
 (`user_metadata`) that can help to identify the resulting model
@@ -197,7 +197,7 @@ overwritten or incorrectly used), and if you add the
 different label definitions will belong to different model groups.
 
 ```yaml
-config_version: 'v7'
+config_version: 'v8'
 
 model_comment: 'inspections: baseline'
 random_seed: 23895478
@@ -371,9 +371,6 @@ feature_aggregations:
 
     intervals: ['all']
 
-    groups:
-      - 'entity_id'
-
 feature_group_definition:
    prefix:
      - 'inspections'
@@ -732,7 +729,7 @@ smart enough to use the previous tables and matrices instead of
 generating them from scratch.
 
 ```yaml
-config_version: 'v7'
+config_version: 'v8'
 
 model_comment: 'inspections: basic ML'
 
@@ -792,9 +789,6 @@ feature_aggregations:
 
     intervals: ['1month', '3month', '6month', '1y', 'all']
 
-    groups:
-      - 'entity_id'
-
   -
     prefix: 'risks'
     from_obj: 'semantic.events'
@@ -816,10 +810,6 @@ feature_aggregations:
 
     intervals: ['1month', '3month', '6month', '1y', 'all']
 
-    groups:
-      - 'entity_id'
-      - 'zip_code'
-
   -
     prefix: 'results'
     from_obj: 'semantic.events'
@@ -839,9 +829,6 @@ feature_aggregations:
 
     intervals: ['1month', '3month', '6month', '1y', 'all']
 
-    groups:
-      - 'entity_id'
-
   -
     prefix: 'inspection_types'
     from_obj: 'semantic.events'
@@ -860,10 +847,6 @@ feature_aggregations:
 
     intervals: ['1month', '3month', '6month', '1y', 'all']
 
-    groups:
-      - 'entity_id'
-      - 'zip_code'
-
 ```
 
 And as stated, we will train some Decision Trees, in particular we are
@@ -1177,7 +1160,7 @@ back to this problem in the Early Warning Systems.
 Ok, let's add a more complete experiment. First the usual generalities.
 
 ```yaml
-config_version: 'v7'
+config_version: 'v8'
 
 model_comment: 'inspections: advanced'