
Merge pull request #175 from mmcdermott/dev
Release Candidate 0.0.6
mmcdermott committed Aug 27, 2024
2 parents e7ed727 + 3f73a35 commit 9549d7e
Showing 41 changed files with 2,817 additions and 400 deletions.
13 changes: 8 additions & 5 deletions .github/workflows/code-quality-main.yaml
@@ -11,14 +11,17 @@ jobs:
   code-quality:
     runs-on: ubuntu-latest

+    strategy:
+      matrix:
+        python-version: ["3.12"]
+
     steps:
-      - name: Checkout
-        uses: actions/checkout@v3
+      - uses: actions/checkout@v4

-      - name: Set up Python 3.12
-        uses: actions/setup-python@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
         with:
-          python-version: "3.12"
+          python-version: ${{ matrix.python-version }}

       - name: Run pre-commits
         uses: pre-commit/action@v3.0.1
13 changes: 8 additions & 5 deletions .github/workflows/code-quality-pr.yaml
@@ -14,14 +14,17 @@ jobs:
   code-quality:
     runs-on: ubuntu-latest

+    strategy:
+      matrix:
+        python-version: ["3.12"]
+
     steps:
-      - name: Checkout
-        uses: actions/checkout@v3
+      - uses: actions/checkout@v4

-      - name: Set up Python 3.12
-        uses: actions/setup-python@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
         with:
-          python-version: "3.12"
+          python-version: ${{ matrix.python-version }}

       - name: Find modified files
         id: file_changes
10 changes: 7 additions & 3 deletions .github/workflows/python-build.yaml
@@ -7,12 +7,16 @@ jobs:
   name: Build distribution 📦
   runs-on: ubuntu-latest

+  strategy:
+    matrix:
+      python-version: ["3.12"]
+
   steps:
     - uses: actions/checkout@v4
-    - name: Set up Python
-      uses: actions/setup-python@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
       with:
-        python-version: "3.12"
+        python-version: ${{ matrix.python-version }}
     - name: Install pypa/build
       run: >-
         python3 -m
10 changes: 6 additions & 4 deletions .github/workflows/tests.yaml
@@ -11,18 +11,20 @@ jobs:
     runs-on: ubuntu-latest

     strategy:
+      matrix:
+        python-version: ["3.11", "3.12"]
       fail-fast: false

     timeout-minutes: 30

     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

-      - name: Set up Python 3.12
-        uses: actions/setup-python@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
         with:
-          python-version: "3.12"
+          python-version: ${{ matrix.python-version }}

       - name: Install packages
         run: |
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -45,7 +45,7 @@ repos:
       rev: v3.10.1
       hooks:
         - id: pyupgrade
-          args: [--py310-plus]
+          args: [--py311-plus]

   # python docstring formatting
   - repo: https://github.com/myint/docformatter
4 changes: 2 additions & 2 deletions MIMIC-IV_Example/README.md
@@ -13,7 +13,7 @@ pypi installation, which is covered below, so make sure you are in a suitable di
 ```bash
 conda create -n MEDS python=3.12
 conda activate MEDS
-pip install MEDS_transforms[examples,local_parallelism]
+pip install "MEDS_transforms[local_parallelism]"
 mkdir MIMIC-IV_Example
 cd MIMIC-IV_Example
 wget https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/main/MIMIC-IV_Example/joint_script.sh
@@ -32,7 +32,7 @@ git clone git@github.com:mmcdermott/MEDS_transforms.git
 cd MEDS_transforms
 conda create -n MEDS python=3.12
 conda activate MEDS
-pip install .[examples,local_parallelism]
+pip install .[local_parallelism]
 ```

## Step 1: Download MIMIC-IV
10 changes: 9 additions & 1 deletion README.md
@@ -1,7 +1,15 @@
 # MEDS Transforms

-[![codecov](https://codecov.io/gh/mmcdermott/MEDS_transforms/graph/badge.svg?token=5RORKQOZF9)](https://codecov.io/gh/mmcdermott/MEDS_transforms)
+[![PyPI - Version](https://img.shields.io/pypi/v/MEDS-transforms)](https://pypi.org/project/MEDS-transforms/)
+![python](https://img.shields.io/badge/-Python_3.12-blue?logo=python&logoColor=white)
+[![Documentation Status](https://readthedocs.org/projects/meds-transforms/badge/?version=latest)](https://meds-transforms.readthedocs.io/en/latest/?badge=latest)
+[![codecov](https://codecov.io/gh/mmcdermott/MEDS_transforms/graph/badge.svg?token=5RORKQOZF9)](https://codecov.io/gh/mmcdermott/MEDS_transforms)
+[![tests](https://github.com/mmcdermott/MEDS_transforms/actions/workflows/tests.yaml/badge.svg)](https://github.com/mmcdermott/MEDS_transforms/actions/workflows/tests.yml)
+[![code-quality](https://github.com/mmcdermott/MEDS_transforms/actions/workflows/code-quality-main.yaml/badge.svg)](https://github.com/mmcdermott/MEDS_transforms/actions/workflows/code-quality-main.yaml)
+[![hydra](https://img.shields.io/badge/Config-Hydra_1.3-89b8cd)](https://hydra.cc/)
+[![license](https://img.shields.io/badge/License-MIT-green.svg?labelColor=gray)](https://github.com/mmcdermott/MEDS_transforms#license)
+[![PRs](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/mmcdermott/MEDS_transforms/pulls)
+[![contributors](https://img.shields.io/github/contributors/mmcdermott/MEDS_transforms.svg)](https://github.com/mmcdermott/MEDS_transforms/graphs/contributors)

 This repository contains a set of functions and scripts for extraction to and transformation/pre-processing of
 MEDS-formatted data.
14 changes: 10 additions & 4 deletions eICU_Example/README.md
@@ -28,14 +28,20 @@ up from this one).

 ## Step 0: Installation

-Download this repository and install the requirements:
+Install the requirements and source the requisite scripts

 ```bash
-git clone git@github.com:mmcdermott/MEDS_transforms.git
-cd MEDS_transforms
 conda create -n MEDS python=3.12
 conda activate MEDS
-pip install .[examples]
+pip install "MEDS_transforms[local_parallelism]"
+mkdir eICU_Example
+cd eICU_Example
+wget https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/main/eICU_Example/joint_script.sh
+wget https://raw.githubusercontent.com/mmcdermott/MEDS_transforms/main/eICU_Example/pre_MEDS.py
+chmod +x joint_script.sh
+chmod +x joint_script_slurm.sh
+chmod +x pre_MEDS.py
+cd ..
 ```

## Step 1: Download eICU
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -10,7 +10,7 @@ authors = [
 ]
 description = "MEDS ETL and transformation functions leveraging a sharding-based parallelism model & polars."
 readme = "README.md"
-requires-python = ">=3.12"
+requires-python = ">=3.11"
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
@@ -54,6 +54,7 @@ MEDS_transform-filter_patients = "MEDS_transforms.filters.filter_patients:main"
 ## Transforms
 MEDS_transform-reorder_measurements = "MEDS_transforms.transforms.reorder_measurements:main"
 MEDS_transform-add_time_derived_measurements = "MEDS_transforms.transforms.add_time_derived_measurements:main"
+MEDS_transform-extract_values = "MEDS_transforms.transforms.extract_values:main"
 MEDS_transform-normalization = "MEDS_transforms.transforms.normalization:main"
 MEDS_transform-occlude_outliers = "MEDS_transforms.transforms.occlude_outliers:main"
 MEDS_transform-tensorization = "MEDS_transforms.transforms.tensorization:main"
8 changes: 8 additions & 0 deletions src/MEDS_transforms/__init__.py
@@ -31,3 +31,11 @@
     "timestamp": "time",
     "subject_id": "patient_id",
 }
+
+INFERRED_STAGE_KEYS = {
+    "is_metadata",
+    "data_input_dir",
+    "metadata_input_dir",
+    "output_dir",
+    "reducer_output_dir",
+}
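`INFERRED_STAGE_KEYS` names per-stage configuration keys that the pipeline derives itself rather than taking from user-authored config. One plausible use, shown here as a hypothetical sketch rather than the library's actual code, is stripping those keys to recover only the user-specified options:

```python
INFERRED_STAGE_KEYS = {
    "is_metadata",
    "data_input_dir",
    "metadata_input_dir",
    "output_dir",
    "reducer_output_dir",
}

# Hypothetical resolved stage config: user options mixed with inferred keys.
stage_cfg = {"aggregations": ["values/sum"], "is_metadata": True, "output_dir": "/tmp/out"}

# Strip pipeline-inferred keys to keep only user-authored options.
user_cfg = {k: v for k, v in stage_cfg.items() if k not in INFERRED_STAGE_KEYS}
print(user_cfg)  # {'aggregations': ['values/sum']}
```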
43 changes: 39 additions & 4 deletions src/MEDS_transforms/aggregate_code_metadata.py
@@ -229,7 +229,7 @@ def validate_args_and_get_code_cols(stage_cfg: DictConfig, code_modifiers: list[
     for agg in aggregations:
         if isinstance(agg, (dict, DictConfig)):
             agg = agg.get("name", None)
-        if agg not in METADATA_FN:
+        if agg not in {fn.value for fn in METADATA_FN}:
             raise ValueError(
                 f"Metadata aggregation function {agg} not found in METADATA_FN enumeration. Values are: "
                 f"{', '.join([fn.value for fn in METADATA_FN])}"
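The membership rewrite above matters because of a Python version difference: before 3.12, testing a plain string against an `Enum` class with `in` raises `TypeError` (only enum members are accepted), while Python 3.12 extended `in` to also match member values. Since this release lowers `requires-python` to `>=3.11`, comparing against an explicit set of values is the portable form. A sketch with `MetadataFn` as a hypothetical stand-in for the real `METADATA_FN`:

```python
from enum import Enum


class MetadataFn(Enum):
    # Hypothetical stand-in for MEDS_transforms' METADATA_FN enumeration.
    CODE_N_OCCURRENCES = "code/n_occurrences"
    VALUES_SUM = "values/sum"


# Portable on 3.11 and 3.12 alike: compare against the enum's values directly.
valid_values = {fn.value for fn in MetadataFn}

print("values/sum" in valid_values)    # True
print("values/bogus" in valid_values)  # False
```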
@@ -406,7 +406,9 @@ def mapper_fntr(
 │ C ┆ 1 ┆ 81.25 ┆ 5.0 ┆ 7.5 │
 │ D ┆ null ┆ 0.0 ┆ null ┆ null │
 └──────┴───────────┴────────────────┴────────────┴────────────┘
->>> stage_cfg = DictConfig({"aggregations": ["values/quantiles"]})
+>>> stage_cfg = DictConfig({
+...     "aggregations": [{"name": "values/quantiles", "quantiles": [0.25, 0.5, 0.75]}]
+... })
 >>> mapper = mapper_fntr(stage_cfg, code_modifiers)
 >>> mapper(df.lazy()).collect().select("code", "modifier1", pl.col("values/quantiles"))
 shape: (5, 3)
@@ -421,6 +423,25 @@
 │ C ┆ 1 ┆ [5.0, 7.5] │
 │ D ┆ null ┆ [] │
 └──────┴───────────┴──────────────────┘
+>>> stage_cfg = DictConfig({
+...     "aggregations": [{"name": "values/quantiles", "quantiles": [0.25, 0.5, 0.75]}],
+...     "do_summarize_over_all_codes": True,
+... })
+>>> mapper = mapper_fntr(stage_cfg, code_modifiers)
+>>> mapper(df.lazy()).collect().select("code", "modifier1", pl.col("values/quantiles"))
+shape: (6, 3)
+┌──────┬───────────┬───────────────────┐
+│ code ┆ modifier1 ┆ values/quantiles  │
+│ ---  ┆ ---       ┆ ---               │
+│ str  ┆ i64       ┆ list[f64]         │
+╞══════╪═══════════╪═══════════════════╡
+│ null ┆ null ┆ [1.1, 2.0, … 7.5] │
+│ A ┆ 1 ┆ [1.1, 1.1] │
+│ A ┆ 2 ┆ [6.0] │
+│ B ┆ 2 ┆ [2.0, 4.0] │
+│ C ┆ 1 ┆ [5.0, 7.5] │
+│ D ┆ null ┆ [] │
+└──────┴───────────┴───────────────────┘
 """

code_key_columns = validate_args_and_get_code_cols(stage_cfg, code_modifiers)
@@ -435,15 +456,20 @@ def by_code_mapper(df: pl.LazyFrame) -> pl.LazyFrame:
         return df.group_by(code_key_columns).agg(**agg_operations).sort(code_key_columns)

     def all_patients_mapper(df: pl.LazyFrame) -> pl.LazyFrame:
-        return df.select(**agg_operations)
+        local_agg_operations = agg_operations.copy()
+        if METADATA_FN.VALUES_QUANTILES in agg_operations:
+            local_agg_operations[METADATA_FN.VALUES_QUANTILES] = agg_operations[
+                METADATA_FN.VALUES_QUANTILES
+            ].implode()
+        return df.select(**local_agg_operations)

     if stage_cfg.get("do_summarize_over_all_codes", False):

         def mapper(df: pl.LazyFrame) -> pl.LazyFrame:
             by_code = by_code_mapper(df)
             all_patients = all_patients_mapper(df)
             return pl.concat([all_patients, by_code], how="diagonal_relaxed").select(
-                *code_key_columns, *aggregations
+                *code_key_columns, *agg_operations.keys()
             )

     else:

else:
@@ -682,6 +708,15 @@ def run_map_reduce(cfg: DictConfig):
         cs.numeric().shrink_dtype().name.keep()
     )

+    old_metadata_fp = Path(cfg.stage_cfg.metadata_input_dir) / "codes.parquet"
+    join_cols = ["code", *cfg.get("code_modifier_cols", [])]
+
+    if old_metadata_fp.exists():
+        logger.info(f"Joining to existing code metadata at {str(old_metadata_fp.resolve())}")
+        existing = pl.scan_parquet(old_metadata_fp)
+        existing = existing.drop(*[c for c in existing.columns if c in set(reduced.columns) - set(join_cols)])
+        reduced = reduced.join(existing, on=join_cols, how="left", coalesce=True)
+
     write_lazyframe(reduced, reducer_fp)
     logger.info(f"Finished reduction in {datetime.now() - start}")

19 changes: 0 additions & 19 deletions src/MEDS_transforms/configs/extract.yaml
@@ -27,8 +27,6 @@ description: |-
 # The event conversion configuration file is used throughout the pipeline to define the events to extract.
 event_conversion_config_fp: ???
-# The code modifier columns are in this pipeline only used in the aggregate_code_metadata stage.
-code_modifiers: null
 # The shards mapping is stored in the root of the final output directory.
 shards_map_fp: "${cohort_dir}/metadata/.shards.json"

@@ -37,27 +35,10 @@
   - split_and_shard_patients
   - convert_to_sharded_events
   - merge_to_MEDS_cohort
-  - aggregate_code_metadata
   - extract_code_metadata
   - finalize_MEDS_metadata
   - finalize_MEDS_data

 stage_configs:
   shard_events:
     data_input_dir: "${input_dir}"
-  aggregate_code_metadata:
-    description: |-
-      This stage collects some descriptive metadata about the codes in the cohort.
-      Args:
-        stage_cfg.aggregations: The aggregations to compute over the codes.
-          Defaults to counts of code occurrences, counts of patients with the code, and counts of value
-          occurrences per code, as well as the sum and sum of squares of values (for use in computing means
-          and variances).
-    aggregations:
-      - "code/n_occurrences"
-      - "code/n_patients"
-      - "values/n_occurrences"
-      - "values/sum"
-      - "values/sum_sqd"
-    do_summarize_over_all_codes: true # This indicates we should include overall, code-independent counts
@@ -1,4 +1,3 @@
fit_vocabulary_indices:
is_metadata: true
ordering_method: "lexicographic"
output_dir: "${cohort_dir}"
4 changes: 4 additions & 0 deletions src/MEDS_transforms/extract/extract_code_metadata.py
@@ -364,6 +364,10 @@ def main(cfg: DictConfig):
     OmegaConf.save(event_conversion_cfg, partial_metadata_dir / "event_conversion_config.yaml")

     events_and_metadata_by_metadata_fp = get_events_and_metadata_by_metadata_fp(event_conversion_cfg)
+    if not events_and_metadata_by_metadata_fp:
+        logger.info("No _metadata blocks in the event_conversion_config.yaml found. Exiting...")
+        return
+
     event_metadata_configs = list(events_and_metadata_by_metadata_fp.items())
     random.shuffle(event_metadata_configs)

20 changes: 12 additions & 8 deletions src/MEDS_transforms/extract/split_and_shard_patients.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import json
+import math
 from collections.abc import Sequence
 from pathlib import Path

@@ -13,15 +14,13 @@
 from MEDS_transforms.utils import stage_init


-def shard_patients[
-    SUBJ_ID_T
-](
+def shard_patients(
     patients: np.ndarray,
     n_patients_per_shard: int = 50000,
-    external_splits: dict[str, Sequence[SUBJ_ID_T]] | None = None,
+    external_splits: dict[str, Sequence[int]] | None = None,
     split_fracs_dict: dict[str, float] | None = {"train": 0.8, "tuning": 0.1, "held_out": 0.1},
     seed: int = 1,
-) -> dict[str, list[SUBJ_ID_T]]:
+) -> dict[str, list[int]]:
     """Shard a list of patients, nested within train/tuning/held-out splits.
     This function takes a list of patients and shards them into train/tuning/held-out splits, with the shards
@@ -72,7 +71,7 @@ def shard_patients[
     >>> shard_patients(patients, n_patients_per_shard=3, split_fracs_dict={'train': 0.5})
     Traceback (most recent call last):
         ...
-    ValueError: The sum of the split fractions must be equal to 1.
+    ValueError: The sum of the split fractions must be equal to 1. Got 0.5 through {'train': 0.5}.
     >>> shard_patients([1, 2], n_patients_per_shard=3)
     Traceback (most recent call last):
         ...
@@ -107,10 +106,15 @@ def shard_patients[
     splits = external_splits

+    splits_cover = sum(split_fracs_dict.values()) if split_fracs_dict else 0
+
     rng = np.random.default_rng(seed)
     if n_patients := len(patient_ids_to_split):
-        if sum(split_fracs_dict.values()) != 1:
-            raise ValueError("The sum of the split fractions must be equal to 1.")
+        if not math.isclose(splits_cover, 1):
+            raise ValueError(
+                f"The sum of the split fractions must be equal to 1. Got {splits_cover} "
+                f"through {split_fracs_dict}."
+            )
         split_names_idx = rng.permutation(len(split_fracs_dict))
         split_names = np.array(list(split_fracs_dict.keys()))[split_names_idx]
         split_fracs = np.array([split_fracs_dict[k] for k in split_names])
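The move from an exact `!= 1` check to `math.isclose` guards against binary floating-point rounding: perfectly reasonable split fractions can sum to something infinitesimally off from 1, which the old check would reject. For example:

```python
import math

split_fracs = {"train": 0.7, "tuning": 0.2, "held_out": 0.1}
total = sum(split_fracs.values())

print(total)                   # 0.9999999999999999 — binary rounding error
print(total == 1)              # False: the exact check rejects a valid config
print(math.isclose(total, 1))  # True: the tolerant check accepts it
```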
Expand Down
3 changes: 1 addition & 2 deletions src/MEDS_transforms/fit_vocabulary_indices.py
@@ -135,8 +135,7 @@ def lexicographic_indices(code_metadata: pl.DataFrame, code_modifiers: list[str]
     ...     "modifier2": [None, None, None, None, 2, 1],
     ... })
     >>> code_modifiers = ["modifier1", "modifier2"]
-    >>> expr = lexicographic_indices(code_metadata, code_modifiers)
-    >>> code_metadata.with_columns(expr)
+    >>> lexicographic_indices(code_metadata, code_modifiers)
     shape: (6, 4)
     ┌──────┬───────────┬───────────┬──────────────────┐
     │ code ┆ modifier1 ┆ modifier2 ┆ code/vocab_index │
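In plain-Python terms, lexicographic vocabulary indexing amounts to sorting the unique codes and numbering them in order. This is a hypothetical simplification: the real `lexicographic_indices` ranks over the code plus its modifier columns in polars, and the index base shown here is an assumption (index 0 is often reserved for padding/unknown in vocabularies):

```python
codes = ["B", "A", "C", "A"]

# Sort the unique codes and number them; the 1-based start is an assumption.
vocab = {code: i + 1 for i, code in enumerate(sorted(set(codes)))}
indices = [vocab[c] for c in codes]
print(indices)  # [2, 1, 3, 1]
```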