monarch-initiative · ielis · Jul 26, 2024 · Jul 26, 2024 · Jul 26, 2024 · Jul 26, 2024
diff --git a/PhenopacketStoreStats.ipynb b/PhenopacketStoreStats.ipynb
diff --git a/matplotlibrc b/matplotlibrc
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+; We want pytest to look for both test files prefixed with underscore `_` and "regular* test files.
+; The unittest files, the test files located in the source code, are prepended with underscore
+; to preclude their inclusion into API reference.
+python_files = _test*.py test*.py
diff --git a/src/ppktstore/__init__.py b/src/ppktstore/__init__.py
@@ -3,7 +3,7 @@
 from . import qc
 from . import stats
 
-__version__ = "0.1.15"
+__version__ = "0.1.16.dev0"
 
 __all__ = [
     "archive",

diff --git a/src/ppktstore/stats/__init__.py b/src/ppktstore/stats/__init__.py
@@ -1,11 +1,11 @@
 from ._ppkt_store_stats import PPKtStoreStats
-from ._summary import summarize_diseases_and_genotype, summarize_genotype_phenotype, summarize_cohort_sizes
+from ._summary import summarize_diseases_and_genotype, summarize_genotype_phenotype, summarize_cohort_sizes, summarize_individuals
 from ._markdown import generate_phenopacket_store_report
 
 
 __all__ = [
     "PPKtStoreStats",
     "summarize_diseases_and_genotype", 
-    "summarize_genotype_phenotype", "summarize_cohort_sizes",
+    "summarize_genotype_phenotype", "summarize_cohort_sizes", "summarize_individuals",
     "generate_phenopacket_store_report",
 ]
diff --git a/src/ppktstore/stats/_ppkt_store_stats.py b/src/ppktstore/stats/_ppkt_store_stats.py
@@ -2,7 +2,7 @@
 import re
 import typing
 
-from collections import defaultdict
+from collections import defaultdict, Counter
 
 import numpy as np
 import pandas as pd
@@ -56,6 +56,18 @@ def get_descriptive_stats(self) -> typing.Mapping[str, typing.Any]:
 
         stats_d["alleles"] = len(unique_alleles)
         stats_d["PMIDs"] = len(df["PMID"].unique())
+        df.groupby(['cohort', 'PMID']).size()
+
+        individuals_per_gene = list(genes_to_ppkt_d.values())
+        stats_d["individuals per gene (max)"] = np.max(individuals_per_gene)
+        stats_d["individuals per gene (min)"] = np.min(individuals_per_gene)
+        stats_d["individuals per gene (mean)"] = np.mean(individuals_per_gene)
+        stats_d["individuals per gene (median)"] = np.median(individuals_per_gene)
+       # stats_d["individuals per gene (n>=10)"] = len([x for x in individuals_per_gene if x >= 10])
+        #stats_d["individuals per gene (n>=20)"] = len([x for x in individuals_per_gene if x >= 20])
+        #stats_d["individuals per gene (n>=50)"] = len([x for x in individuals_per_gene if x >= 50])
+       # stats_d["individuals per gene (n>=100)"] = len([x for x in individuals_per_gene if x >= 100])
+
         individuals_per_disease = list(diseases_to_ppkt_id.values())
         stats_d["individuals per disease (max)"] = np.max(individuals_per_disease)
         stats_d["individuals per disease (min)"] = np.min(individuals_per_disease)
@@ -71,27 +83,23 @@ def get_descriptive_stats(self) -> typing.Mapping[str, typing.Any]:
         stats_d["genes associated with a single disease"] = len([x for x in genes_to_disease_d.keys() if len(genes_to_disease_d[x]) == 1])
         stats_d["genes associated with two diseases"] = len([x for x in genes_to_disease_d.keys() if len(genes_to_disease_d[x]) == 2])
         stats_d["genes associated with multiple diseases"] = len([x for x in genes_to_disease_d.keys() if len(genes_to_disease_d[x]) > 2])
-        max_gene = None
+        max_genes = set()
         max_count = 0
-        for k,v in genes_to_disease_d.items():
-            n_diseases = len(v)
-            if n_diseases > max_count:
-                max_count = n_diseases
-                max_gene = k
-        max_msg = f"{max_gene} with {max_count} associated diseases"
-        stats_d["gene with maximum number of diseases"] = max_msg
-        individuals_per_gene = list(genes_to_ppkt_d.values())
-        #stats_d["individuals per gene (max)"] = np.max(individuals_per_gene)
-       # stats_d["individuals per gene (min)"] = np.min(individuals_per_gene)
-        #stats_d["individuals per gene (mean)"] = np.mean(individuals_per_gene)
-        #stats_d["individuals per gene (median)"] = np.median(individuals_per_gene)
-       # stats_d["individuals per gene (n>=10)"] = len([x for x in individuals_per_gene if x >= 10])
-        #stats_d["individuals per gene (n>=20)"] = len([x for x in individuals_per_gene if x >= 20])
-        #stats_d["individuals per gene (n>=50)"] = len([x for x in individuals_per_gene if x >= 50])
-       # stats_d["individuals per gene (n>=100)"] = len([x for x in individuals_per_gene if x >= 100])
-        all_hpo_d = PPKtStoreStats._get_all_unique_hpo_d(self._store)      
-        stats_d["total HPO terms used"] = len(all_hpo_d)
-
+        for gene, diseases in genes_to_disease_d.items():
+            n_diseases = len(diseases)
+            if n_diseases >= max_count:
+                if n_diseases > max_count:
+                    max_count = n_diseases
+                    max_genes.clear()
+                max_genes.add(gene)
+        genes_text = ', '.join(sorted(max_genes))
+        max_msg = f"{genes_text} with {max_count} associated diseases"
+        stats_d["gene(s) with maximum number of diseases"] = max_msg
+        total_hpo_stats = PPKtStoreStats._get_total_and_unique_hpo_counts(self._store)      
+        stats_d.update(total_hpo_stats)
+
+        pf_summary = PPKtStoreStats._summarize_phenotypic_features(self._store)
+        stats_d.update(pf_summary)
         return stats_d
 
 
@@ -116,20 +124,74 @@ def get_counts_per_disease_df(self) -> pd.DataFrame:
         return pd.DataFrame(items, index=None)
 
     @staticmethod
-    def _get_all_unique_hpo_d(
+    def _get_total_and_unique_hpo_counts(
         phenopacket_store: PhenopacketStore,
-    ) -> typing.Mapping[str, str]:
-        """return a dictionary with key=HPO id, value=label of all HPOs used in phenopacket-store
+    ) -> typing.Mapping[str, float]:
         """
-        all_hpo_d = dict()
+        Return a tuple with the counts of all and distinct HPOs used in Phenopacket store.
+        """
+        n_total = 0
+        n_present = 0
+        n_excluded = 0
+        unique = set()
+
         for cohort_info in phenopacket_store.cohorts():
             for pp_info in cohort_info.phenopackets:
                 pp = pp_info.phenopacket
                 for pf in pp.phenotypic_features:
-                    all_hpo_d[pf.type.id] = pf.type.label
-        print(f"Got {len(all_hpo_d)} unique HPOs")
-        return all_hpo_d
+                    if pf.excluded:
+                        n_excluded += 1
+                    else:
+                        n_present += 1
+
+                    unique.add(pf.type.id)
+                    n_total += 1
+
+        return {
+            "total HPO terms used": n_total,
+            "present HPO terms used": n_present,
+            "excluded HPO terms used": n_excluded,
+            "unique HPO terms used": len(unique),
+        }
 
+    @staticmethod
+    def _summarize_phenotypic_features(
+        store: PhenopacketStore,
+    ) -> typing.Mapping[str, float]:
+        cohort2present = Counter()
+        cohort2excluded = Counter()
+        cohort2total = Counter()
+        for cohort_info in store.cohorts():
+            cohort_name = cohort_info.name
+            for pp_info in cohort_info.phenopackets:
+                pp = pp_info.phenopacket
+                cohort2present[cohort_name] += sum(1 for pf in pp.phenotypic_features if not pf.excluded)
+                cohort2excluded[cohort_name] += sum(1 for pf in pp.phenotypic_features if pf.excluded)
+                cohort2total[cohort_name] += len(pp.phenotypic_features)
+
+        c2p = list(cohort2present.values())
+        c2e = list(cohort2excluded.values())
+        c2t = list(cohort2total.values())
+
+        # TODO: this should be per sample
+
+        return {
+            'present HPO terms per cohort (mean)': float(np.mean(c2p)),
+            'present HPO terms per cohort (median)': float(np.median(c2p)),
+            'present HPO terms per cohort (min)': float(np.min(c2p)),
+            'present HPO terms per cohort (max)': float(np.max(c2p)),
+
+            'excluded HPO terms per cohort (mean)': float(np.mean(c2e)),
+            'excluded HPO terms per cohort (median)': float(np.median(c2e)),
+            'excluded HPO terms per cohort (min)': float(np.min(c2e)),
+            'excluded HPO terms per cohort (max)': float(np.max(c2e)),
+
+            'total HPO terms per cohort (mean)': float(np.mean(c2t)),
+            'total HPO terms per cohort (median)': float(np.median(c2t)),
+            'total HPO terms per cohort (min)': float(np.min(c2t)),
+            'total HPO terms per cohort (max)': float(np.max(c2t)),
+        }
+
 
     def check_disease_id(self):
         """
@@ -227,7 +289,7 @@ def get_disease_to_phenopacket_count_d(self) -> typing.Dict[str, int]:
         print(f"Extracted a total of {len(disease_to_ppkt_count_d)} diseases")
         return disease_to_ppkt_count_d
 
-    def _get_gene_symbol(self, ppkt: Phenopacket) -> str:
+    def _get_gene_symbol(self, ppkt: Phenopacket) -> typing.Optional[str]:
         if len(ppkt.interpretations) == 0:
             print(f"Warning: Got no interpretations for {ppkt.id}. Skipping")
             return None

diff --git a/src/ppktstore/stats/_summary.py b/src/ppktstore/stats/_summary.py
@@ -1,4 +1,5 @@
 import os
+import re
 import typing
 import warnings
 
@@ -7,14 +8,29 @@
 from collections import defaultdict
 
 from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket
-from phenopackets.schema.v2.core.meta_data_pb2 import MetaData
+from phenopackets.schema.v2.core.base_pb2 import TimeElement, Age
 from phenopackets.schema.v2.core.interpretation_pb2 import Diagnosis
+from phenopackets.schema.v2.core.individual_pb2 import Sex, Individual, VitalStatus
+from phenopackets.schema.v2.core.meta_data_pb2 import MetaData
 
 from phenopackets.vrsatile.v1.vrsatile_pb2 import VariationDescriptor
 
 from ppktstore.model import PhenopacketStore
 
 
+iso_duration_pt = re.compile(
+    r"^P((?P<years>\d+)Y)?((?P<months>\d+)M)?((?P<days>\d+)D)?(T((?P<hours>\d+)H)?((?P<minutes>\d+)M)?((?P<seconds>\d+)S)?)?$"
+)
+unit_to_days = {
+    "years": 365.25,
+    "months": 30,
+    "days": 1,
+    "hours": 1 / 24,
+    "minutes": 1 / (24 * 60),
+    "seconds": 1 / (24 * 60 * 60),
+}
+
+
 def summarize_diseases_and_genotype(
     phenopacket_store: PhenopacketStore,
 ) -> pd.DataFrame:
@@ -52,7 +68,21 @@ def summarize_diseases_and_genotype(
                     data["cohort"].append(cohort.name)
                     data["filename"].append(pp_path)
 
-    return pd.DataFrame(data)
+    columns = pd.Index(
+        [
+            "patient_id",
+            "cohort",
+            "disease_id",
+            "disease",
+            "gene",
+            "allele_1",
+            "allele_2",
+            "PMID",
+            "filename",
+        ]
+    )
+
+    return pd.DataFrame(data, columns=columns)
 
 
 def summarize_genotype_phenotype(
@@ -126,6 +156,32 @@ def summarize_cohort_sizes(
     return pd.DataFrame(rows)
 
 
+def summarize_individuals(
+    store: PhenopacketStore,
+) -> pd.DataFrame:
+    data = defaultdict(list)
+    for cohort_info in store.cohorts():
+        for pp_info in cohort_info.phenopackets:
+            pp = pp_info.phenopacket
+            subject = pp.subject
+
+            data["id"].append(f"{pp.id}-{subject.id}")
+            data["sex"].append(Sex.Name(subject.sex))
+            n_days = _extract_age(subject)
+            data["age_in_days"].append(n_days)
+            data["age_in_years"].append(n_days / unit_to_days["years"])
+
+            vs = None
+            if subject.HasField("vital_status"):
+                vs = VitalStatus.Status.Name(subject.vital_status.status)
+            else:
+                vs = None
+            data["vital_status"].append(vs)
+
+    columns = pd.Index(["id", "sex", "age_in_days", "age_in_years", "vital_status"])
+    return pd.DataFrame(data=data, columns=columns)
+
+
 def _get_encounter_count(
     pp: Phenopacket,
 ) -> int:
@@ -149,7 +205,7 @@ def _get_pmid(
 
 
 def _get_gene_and_alleles(
-    diagnosis: Diagnosis
+    diagnosis: Diagnosis,
 ) -> typing.Tuple[typing.Optional[str], typing.Sequence[str]]:
     """
     Extract the gene symbol, failing that the label, for a variant from the Interpretation object.
@@ -185,7 +241,7 @@ def _get_gene_and_alleles(
                 genotype = var_desc.allelic_state
                 if genotype.label == "homozygous" and len(alleles) > 0:
                     alleles.append(alleles[0])
-    
+
     return gene, alleles
 
 
@@ -210,3 +266,34 @@ def _get_structural_var(
     else:
         raise ValueError(f"Could not find structural_type field")
     return alleles
+
+
+def _extract_age(
+    subject: Individual,
+) -> float:
+    if subject.HasField("time_at_last_encounter"):
+        tale = subject.time_at_last_encounter
+        if tale.HasField("age"):
+            age = tale.age
+            ageperiod = age.iso8601duration
+            if ageperiod is not None or ageperiod != "":
+                return _parse_to_days(ageperiod)
+
+    return float("nan")
+
+
+def _parse_to_days(
+    age_period: str,
+) -> float:
+    match = iso_duration_pt.match(age_period)
+    if match is None:
+        return float("nan")
+    else:
+        n_days = 0.0
+
+        for unit, multiplier in unit_to_days.items():
+            val = match.group(unit)
+            if val is not None:
+                n_days += float(val) * multiplier
+
+        return n_days
diff --git a/src/ppktstore/stats/_test_summary.py b/src/ppktstore/stats/_test_summary.py
@@ -0,0 +1,59 @@
+import typing
+
+import pytest
+
+
+from ._summary import iso_duration_pt, _parse_to_days
+
+
+@pytest.mark.parametrize(
+    "value, years, months, days, hours, minutes, seconds",
+    [
+        ("P3Y6M4DT12H30M5S", "3", "6", "4", "12", "30", "5"),
+        ("P3Y6M4D", "3", "6", "4", None, None, None),
+        ("P3Y", "3", None, None, None, None, None),
+        ("P6M", None, "6", None, None, None, None),
+        ("P4D", None, None, "4", None, None, None),
+    ],
+)
+def test_iso_duration_pt__ok_values(
+    value: str,
+    years: typing.Optional[str],
+    months: typing.Optional[str],
+    days: typing.Optional[str],
+    hours: typing.Optional[str],
+    minutes: typing.Optional[str],
+    seconds: typing.Optional[str],
+):
+    match = iso_duration_pt.match(value)
+
+    assert match is not None
+
+    assert match.group("years") == years
+    assert match.group("months") == months
+    assert match.group("days") == days
+    assert match.group("hours") == hours
+    assert match.group("minutes") == minutes
+    assert match.group("seconds") == seconds
+
+
+@pytest.mark.parametrize(
+    "value, expected",
+    [
+        ("P3Y6M4DT12H30M5S", 1280.27089,),
+        ("P3Y6M4D", 1279.75,),
+        ("P3Y", 1095.75,),
+        ("P3Y4M", 1215.75,),
+        ("P6M", 180.,),
+        ("P4D", 4.,),
+        ("PT12H", .5,),
+        ("PT1S", .00001157407,),
+    ],
+)
+def test__parse_to_days(
+    value: str,
+    expected: float,
+):
+    actual = _parse_to_days(value)
+
+    assert actual == pytest.approx(expected)