Adjusted join in flat reps to account for different timestamps with t… #107

Open · wants to merge 8 commits into main
14 changes: 11 additions & 3 deletions EventStream/baseline/FT_task_baseline.py
@@ -46,6 +46,7 @@ def load_flat_rep(
do_update_if_missing: bool = True,
task_df_name: str | None = None,
do_cache_filtered_task: bool = True,
overwrite_cache_filtered_task: bool = False,

Ensure proper documentation for the new parameter.

The new parameter overwrite_cache_filtered_task should be included in the function's docstring to maintain comprehensive documentation.

+        overwrite_cache_filtered_task: If `True`, the flat representations will be regenerated. If `False`, the cached file will be loaded if it exists.
Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
overwrite_cache_filtered_task: bool = False,

subjects_included: dict[str, set[int]] | None = None,
) -> dict[str, pl.LazyFrame]:
"""Loads a set of flat representations from a passed dataset that satisfy the given constraints.
@@ -67,14 +68,16 @@ def load_flat_rep(
do_update_if_missing: If `True`, then if any window sizes or features are missing, the function will
try to update the stored flat representations to reflect these. If `False`, if information is
missing, it will raise a `FileNotFoundError` instead.
task_df_name: If specified, the flat representations loaded will be (inner) joined against the task
task_df_name: If specified, the flat representations loaded will be joined against the task
dataframe of this name on the columns ``"subject_id"`` and ``"end_time"`` (which will be renamed
to ``"timestamp"``). This is to avoid needing to load the full dataset in flattened form into
memory. This is also used as a cache key; if a pre-filtered dataset is written to disk at a
specified path for this task, then the data will be loaded from there, rather than from the base
dataset.
do_cache_filtered_task: If `True`, the flat representations will, after being filtered to just the
relevant rows for the task, be cached to disk for faster re-use.
overwrite_cache_filtered_task: If `True`, the flat representations will be regenerated. If `False`,
the cached file will be loaded if it exists.
Comment on lines +79 to +80

Tip

Codebase Verification

Ensure all calls to load_flat_rep include the new parameter overwrite_cache_filtered_task.

The following calls to load_flat_rep in sample_data/examine_synthetic_data.ipynb do not include the new parameter and should be updated:

  • flat_reps = load_flat_rep(ESD, window_sizes=['7d'])
  • flat_reps = load_flat_rep(ESD, window_sizes=['FULL'], feature_inclusion_frequency=0.001)
  • flat_reps = load_flat_rep(ESD, window_sizes=['1d', '7d', 'FULL'])
  • flat_reps = load_flat_rep(ESD, window_sizes=['2d'], do_update_if_missing=False)
  • flat_reps = load_flat_rep(ESD, window_sizes=['1d', '7d', 'FULL'], subjects_included={'train': {0, 1, 2}})
  • flat_reps = load_flat_rep(ESD, window_sizes=['7d'], task_df_name='single_label_binary_classification')
  • flat_reps = load_flat_rep(ESD, window_sizes=['FULL', '1d'], task_df_name='multi_class_classification', do_cache_filtered_task=False)
  • flat_reps = load_flat_rep(ESD, window_sizes=['FULL', '1d'], task_df_name='single_label_binary_classification', subjects_included={'train': {0, 1, 2}})

Please update these calls to include the overwrite_cache_filtered_task parameter as needed.
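
Since the new parameter defaults to `False`, the calls above keep their current behavior; passing it explicitly is only needed to force a cache refresh. A hypothetical updated call for illustration (window size and task name taken from the list above):

flat_reps = load_flat_rep(
    ESD,
    window_sizes=['7d'],
    task_df_name='single_label_binary_classification',
    overwrite_cache_filtered_task=True,  # regenerate the cached, task-filtered flat representation
)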

Analysis chain

LGTM! But verify the new parameter usage in the codebase.

The code changes are approved.

However, ensure that all function calls to load_flat_rep are updated to use the new parameter if necessary.

Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Description: Verify all function calls to `load_flat_rep` match the new signature.

# Test: Search for the function usage. Expect: Only occurrences of the new signature.
rg --type python $'load_flat_rep'

Length of output: 63


Script:

#!/bin/bash
# Description: Verify all function calls to `load_flat_rep` match the new signature.

# Test: Search for the function usage. Expect: Only occurrences of the new signature.
rg 'load_flat_rep'

Length of output: 1215


Script:

#!/bin/bash
# Description: Extract lines around each call to `load_flat_rep` to verify the usage of the new parameter.

# Extract lines around the calls in the Jupyter notebook
rg -A 3 -B 3 'load_flat_rep' sample_data/examine_synthetic_data.ipynb

# Extract lines around the calls in the Python file
rg -A 3 -B 3 'load_flat_rep' EventStream/baseline/FT_task_baseline.py

Length of output: 3273

subjects_included: A dictionary by split of the subjects to include in the task. Omitted splits are
used wholesale.

@@ -170,7 +173,7 @@ def load_flat_rep(
if task_df_name is not None:
fn = fp.parts[-1]
cached_fp = task_window_dir / fn
if cached_fp.is_file():
if cached_fp.is_file() and not overwrite_cache_filtered_task:
df = pl.scan_parquet(cached_fp).select("subject_id", "timestamp", *window_features)
if subjects_included.get(sp, None) is not None:
subjects = list(set(subjects).intersection(subjects_included[sp]))
@@ -182,7 +185,12 @@ def load_flat_rep(
if task_df_name is not None:
filter_join_df = sp_join_df.select(join_keys).filter(pl.col("subject_id").is_in(subjects))

df = df.join(filter_join_df, on=join_keys, how="inner")
df = filter_join_df.join_asof(
df,
by="subject_id",
on="timestamp",
strategy="forward" if "-" in window_size else "backward",
)

if do_cache_filtered_task:
cached_fp.parent.mkdir(exist_ok=True, parents=True)
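The change above swaps the exact-match inner join on ``subject_id``/``timestamp`` for a polars as-of join, so a task end time that falls between event timestamps still matches the nearest flat-representation row (forward when the window size contains "-", which presumably marks a future-looking window, backward otherwise). A minimal sketch of the as-of join semantics with toy data (frame contents and the `feat` column are illustrative, not from the repo):

from datetime import datetime

import polars as pl

# join_asof expects both frames sorted on the "on" key, hence the explicit sorts below.
# Task rows whose "end_time" (renamed "timestamp") need not line up with event timestamps.
tasks = pl.DataFrame({
    "subject_id": [1, 1],
    "timestamp": [datetime(2020, 1, 1, 12, 30), datetime(2020, 1, 3, 9, 0)],
}).sort("subject_id", "timestamp")

# Flat-representation rows, one per event timestamp.
flat = pl.DataFrame({
    "subject_id": [1, 1, 1],
    "timestamp": [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 4)],
    "feat": [0.1, 0.2, 0.3],
}).sort("subject_id", "timestamp")

# strategy="backward": each task row takes the most recent flat row at or before its timestamp;
# strategy="forward" would instead take the next flat row at or after it.
joined = tasks.join_asof(flat, on="timestamp", by="subject_id", strategy="backward")
# 2020-01-01 12:30 pairs with the 2020-01-01 row; 2020-01-03 09:00 pairs with the 2020-01-02 row.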
10 changes: 5 additions & 5 deletions EventStream/data/dataset_base.py
@@ -223,17 +223,17 @@ def build_event_and_measurement_dfs(
all_events_and_measurements = []
event_types = []

for df, schemas in schemas_by_df.items():
for df_name, schemas in schemas_by_df.items():
all_columns = []

all_columns.extend(itertools.chain.from_iterable(s.columns_to_load for s in schemas))

try:
df = cls._load_input_df(df, all_columns, subject_id_col, subject_ids_map, subject_id_dtype)
df = cls._load_input_df(df_name, all_columns, subject_id_col, subject_ids_map, subject_id_dtype)
except Exception as e:
raise ValueError(f"Errored while loading {df}") from e
raise ValueError(f"Errored while loading {df_name}") from e

for schema in schemas:
for schema in tqdm(schemas, desc=f"Processing events and measurements df for {df_name.split('/')[-1]}"):
if schema.filter_on:
df = cls._filter_col_inclusion(schema.filter_on)
match schema.type:
@@ -266,7 +266,7 @@ def build_event_and_measurement_dfs(

all_events, all_measurements = [], []
running_event_id_max = 0
for event_type, (events, measurements) in zip(event_types, all_events_and_measurements):
for event_type, (events, measurements) in tqdm(zip(event_types, all_events_and_measurements), desc="Incrementing and combining events and measurements"):
try:
new_events = cls._inc_df_col(events, "event_id", running_event_id_max)
except Exception as e:
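One note on the progress bars added above: wrapping a bare `zip(...)` gives tqdm no length, so the bar shows a running count but no percentage or ETA; supplying `total=` restores them. A minimal illustration (the `total=` argument is an optional addition, not part of this diff):

from tqdm import tqdm

event_types = ["admission", "lab", "discharge"]
all_events_and_measurements = [(None, None)] * len(event_types)  # stand-in (events, measurements) pairs

for event_type, (events, measurements) in tqdm(
    zip(event_types, all_events_and_measurements),
    total=len(event_types),  # zip has no __len__, so tqdm cannot infer this on its own
    desc="Incrementing and combining events and measurements",
):
    pass  # stand-in for the incrementing/combining logic in the diff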
15 changes: 9 additions & 6 deletions EventStream/data/dataset_polars.py
@@ -705,7 +705,7 @@ def _update_subject_event_properties(self):
)

n_events_pd = self.events_df.get_column("subject_id").value_counts(sort=False).to_pandas()
self.n_events_per_subject = n_events_pd.set_index("subject_id")["counts"].to_dict()
self.n_events_per_subject = n_events_pd.set_index("subject_id")["count"].to_dict()
self.subject_ids = set(self.n_events_per_subject.keys())

if self.subjects_df is not None:
@@ -853,7 +853,7 @@ def _add_inferred_val_types(
.alias("is_int")
)
int_keys = for_val_type_inference.groupby(vocab_keys_col).agg(is_int_expr)

measurement_metadata = measurement_metadata.join(int_keys, on=vocab_keys_col, how="outer")

key_is_int = pl.col(vocab_keys_col).is_in(int_keys.filter("is_int")[vocab_keys_col])
@@ -1105,7 +1105,7 @@ def _fit_vocabulary(self, measure: str, config: MeasurementConfig, source_df: DF
try:
value_counts = observations.value_counts()
vocab_elements = value_counts.get_column(measure).to_list()
el_counts = value_counts.get_column("counts")
el_counts = value_counts.get_column("count")
return Vocabulary(vocabulary=vocab_elements, obs_frequencies=el_counts)
except AssertionError as e:
raise AssertionError(f"Failed to build vocabulary for {measure}") from e
@@ -1417,7 +1417,8 @@ def _summarize_static_measurements(
if include_only_subjects is None:
df = self.subjects_df
else:
df = self.subjects_df.filter(pl.col("subject_id").is_in(list(include_only_subjects)))
self.subjects_df = self.subjects_df.with_columns(pl.col("subject_id").cast(pl.Utf8))
df = self.subjects_df.filter(pl.col("subject_id").is_in([str(id) for id in include_only_subjects]))

valid_measures = {}
for feat_col in feature_columns:
@@ -1477,7 +1478,8 @@ def _summarize_time_dependent_measurements(
if include_only_subjects is None:
df = self.events_df
else:
df = self.events_df.filter(pl.col("subject_id").is_in(list(include_only_subjects)))
self.events_df = self.events_df.with_columns(pl.col("subject_id").cast(pl.Utf8))
df = self.events_df.filter(pl.col("subject_id").is_in([str(id) for id in include_only_subjects]))

valid_measures = {}
for feat_col in feature_columns:
@@ -1540,8 +1542,9 @@ def _summarize_dynamic_measurements(
if include_only_subjects is None:
df = self.dynamic_measurements_df
else:
self.events_df = self.events_df.with_columns(pl.col("subject_id").cast(pl.Utf8))
df = self.dynamic_measurements_df.join(
self.events_df.filter(pl.col("subject_id").is_in(list(include_only_subjects))).select(
self.events_df.filter(pl.col("subject_id").is_in([str(id) for id in include_only_subjects])).select(
"event_id"
),
on="event_id",
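The `"counts"` → `"count"` renames above track a polars API change: recent polars releases name the frequency column returned by `value_counts` `"count"`, where older releases used `"counts"`. A minimal sketch of the pattern the diff relies on, assuming a recent polars version:

import polars as pl

s = pl.Series("subject_id", [1, 1, 2])
vc = s.value_counts(sort=False)  # columns: ["subject_id", "count"] on recent polars
n_events_per_subject = vc.to_pandas().set_index("subject_id")["count"].to_dict()
# -> {1: 2, 2: 1}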
44 changes: 44 additions & 0 deletions scripts/build_flat_reps.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python
"""Builds a flat representation dataset given a hydra config file."""

try:
import stackprinter

stackprinter.set_excepthook(style="darkbg2")
except ImportError:
pass # no need to fail because of missing dev dependency

from pathlib import Path
import hydra
from omegaconf import DictConfig, OmegaConf

Remove unused import to clean up the code.

- from omegaconf import DictConfig, OmegaConf
+ from omegaconf import DictConfig
Committable suggestion

Suggested change
from omegaconf import DictConfig, OmegaConf
from omegaconf import DictConfig
Tools
Ruff

13-13: omegaconf.OmegaConf imported but unused (F401)

Remove unused import: omegaconf.OmegaConf

from loguru import logger

from EventStream.data.dataset_polars import Dataset

@hydra.main(version_base=None, config_path="../configs", config_name="dataset_base")
def main(cfg: DictConfig):
cfg = hydra.utils.instantiate(cfg, _convert_="all")
save_dir = Path(cfg.pop("save_dir"))
window_sizes = cfg.pop("window_sizes")
subjects_per_output_file = cfg.pop("subjects_per_output_file") if 'subjects_per_output_file' in cfg else None

Consider adding error handling for missing configuration items to prevent runtime errors.

# Example of adding default values and error handling
save_dir = Path(cfg.pop("save_dir", "default/path"))
window_sizes = cfg.pop("window_sizes", [default_window_size])
subjects_per_output_file = cfg.pop("subjects_per_output_file", default_subjects_per_output_file)


# Build flat reps for specified task and window sizes
logger.debug('Loading ESD..')
ESD = Dataset.load(save_dir)
feature_inclusion_frequency, include_only_measurements = ESD._resolve_flat_rep_cache_params(
feature_inclusion_frequency=None, include_only_measurements=None
)
cache_kwargs = dict(
subjects_per_output_file=subjects_per_output_file,
feature_inclusion_frequency=feature_inclusion_frequency, #0.1
window_sizes=window_sizes,
include_only_measurements=include_only_measurements,
do_overwrite=False,
do_update=True,
)
logger.debug('Caching flat representation..')
ESD.cache_flat_representation(**cache_kwargs)
logger.debug('Done')

if __name__ == "__main__":
main()