Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update stats for the manuscript #131

Merged
merged 4 commits into from
Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
946 changes: 758 additions & 188 deletions PhenopacketStoreStats.ipynb

Large diffs are not rendered by default.

624 changes: 624 additions & 0 deletions matplotlibrc

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[pytest]
; We want pytest to look for both test files prefixed with underscore `_` and "regular* test files.
; The unittest files, the test files located in the source code, are prepended with underscore
; to preclude their inclusion into API reference.
python_files = _test*.py test*.py
2 changes: 1 addition & 1 deletion src/ppktstore/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from . import qc
from . import stats

__version__ = "0.1.15"
__version__ = "0.1.16.dev0"

__all__ = [
"archive",
Expand Down
4 changes: 2 additions & 2 deletions src/ppktstore/stats/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from ._ppkt_store_stats import PPKtStoreStats
from ._summary import summarize_diseases_and_genotype, summarize_genotype_phenotype, summarize_cohort_sizes
from ._summary import summarize_diseases_and_genotype, summarize_genotype_phenotype, summarize_cohort_sizes, summarize_individuals
from ._markdown import generate_phenopacket_store_report


__all__ = [
"PPKtStoreStats",
"summarize_diseases_and_genotype",
"summarize_genotype_phenotype", "summarize_cohort_sizes",
"summarize_genotype_phenotype", "summarize_cohort_sizes", "summarize_individuals",
"generate_phenopacket_store_report",
]
120 changes: 91 additions & 29 deletions src/ppktstore/stats/_ppkt_store_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re
import typing

from collections import defaultdict
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -56,6 +56,18 @@ def get_descriptive_stats(self) -> typing.Mapping[str, typing.Any]:

stats_d["alleles"] = len(unique_alleles)
stats_d["PMIDs"] = len(df["PMID"].unique())
df.groupby(['cohort', 'PMID']).size()

individuals_per_gene = list(genes_to_ppkt_d.values())
stats_d["individuals per gene (max)"] = np.max(individuals_per_gene)
stats_d["individuals per gene (min)"] = np.min(individuals_per_gene)
stats_d["individuals per gene (mean)"] = np.mean(individuals_per_gene)
stats_d["individuals per gene (median)"] = np.median(individuals_per_gene)
# stats_d["individuals per gene (n>=10)"] = len([x for x in individuals_per_gene if x >= 10])
#stats_d["individuals per gene (n>=20)"] = len([x for x in individuals_per_gene if x >= 20])
#stats_d["individuals per gene (n>=50)"] = len([x for x in individuals_per_gene if x >= 50])
# stats_d["individuals per gene (n>=100)"] = len([x for x in individuals_per_gene if x >= 100])

individuals_per_disease = list(diseases_to_ppkt_id.values())
stats_d["individuals per disease (max)"] = np.max(individuals_per_disease)
stats_d["individuals per disease (min)"] = np.min(individuals_per_disease)
Expand All @@ -71,27 +83,23 @@ def get_descriptive_stats(self) -> typing.Mapping[str, typing.Any]:
stats_d["genes associated with a single disease"] = len([x for x in genes_to_disease_d.keys() if len(genes_to_disease_d[x]) == 1])
stats_d["genes associated with two diseases"] = len([x for x in genes_to_disease_d.keys() if len(genes_to_disease_d[x]) == 2])
stats_d["genes associated with multiple diseases"] = len([x for x in genes_to_disease_d.keys() if len(genes_to_disease_d[x]) > 2])
max_gene = None
max_genes = set()
max_count = 0
for k,v in genes_to_disease_d.items():
n_diseases = len(v)
if n_diseases > max_count:
max_count = n_diseases
max_gene = k
max_msg = f"{max_gene} with {max_count} associated diseases"
stats_d["gene with maximum number of diseases"] = max_msg
individuals_per_gene = list(genes_to_ppkt_d.values())
#stats_d["individuals per gene (max)"] = np.max(individuals_per_gene)
# stats_d["individuals per gene (min)"] = np.min(individuals_per_gene)
#stats_d["individuals per gene (mean)"] = np.mean(individuals_per_gene)
#stats_d["individuals per gene (median)"] = np.median(individuals_per_gene)
# stats_d["individuals per gene (n>=10)"] = len([x for x in individuals_per_gene if x >= 10])
#stats_d["individuals per gene (n>=20)"] = len([x for x in individuals_per_gene if x >= 20])
#stats_d["individuals per gene (n>=50)"] = len([x for x in individuals_per_gene if x >= 50])
# stats_d["individuals per gene (n>=100)"] = len([x for x in individuals_per_gene if x >= 100])
all_hpo_d = PPKtStoreStats._get_all_unique_hpo_d(self._store)
stats_d["total HPO terms used"] = len(all_hpo_d)

for gene, diseases in genes_to_disease_d.items():
n_diseases = len(diseases)
if n_diseases >= max_count:
if n_diseases > max_count:
max_count = n_diseases
max_genes.clear()
max_genes.add(gene)
genes_text = ', '.join(sorted(max_genes))
max_msg = f"{genes_text} with {max_count} associated diseases"
stats_d["gene(s) with maximum number of diseases"] = max_msg
total_hpo_stats = PPKtStoreStats._get_total_and_unique_hpo_counts(self._store)
stats_d.update(total_hpo_stats)

pf_summary = PPKtStoreStats._summarize_phenotypic_features(self._store)
stats_d.update(pf_summary)
return stats_d


Expand All @@ -116,20 +124,74 @@ def get_counts_per_disease_df(self) -> pd.DataFrame:
return pd.DataFrame(items, index=None)

@staticmethod
def _get_all_unique_hpo_d(
def _get_total_and_unique_hpo_counts(
phenopacket_store: PhenopacketStore,
) -> typing.Mapping[str, str]:
"""return a dictionary with key=HPO id, value=label of all HPOs used in phenopacket-store
) -> typing.Mapping[str, float]:
"""
all_hpo_d = dict()
Return a tuple with the counts of all and distinct HPOs used in Phenopacket store.
"""
n_total = 0
n_present = 0
n_excluded = 0
unique = set()

for cohort_info in phenopacket_store.cohorts():
for pp_info in cohort_info.phenopackets:
pp = pp_info.phenopacket
for pf in pp.phenotypic_features:
all_hpo_d[pf.type.id] = pf.type.label
print(f"Got {len(all_hpo_d)} unique HPOs")
return all_hpo_d
if pf.excluded:
n_excluded += 1
else:
n_present += 1

unique.add(pf.type.id)
n_total += 1

return {
"total HPO terms used": n_total,
"present HPO terms used": n_present,
"excluded HPO terms used": n_excluded,
"unique HPO terms used": len(unique),
}

@staticmethod
def _summarize_phenotypic_features(
store: PhenopacketStore,
) -> typing.Mapping[str, float]:
cohort2present = Counter()
cohort2excluded = Counter()
cohort2total = Counter()
for cohort_info in store.cohorts():
cohort_name = cohort_info.name
for pp_info in cohort_info.phenopackets:
pp = pp_info.phenopacket
cohort2present[cohort_name] += sum(1 for pf in pp.phenotypic_features if not pf.excluded)
cohort2excluded[cohort_name] += sum(1 for pf in pp.phenotypic_features if pf.excluded)
cohort2total[cohort_name] += len(pp.phenotypic_features)

c2p = list(cohort2present.values())
c2e = list(cohort2excluded.values())
c2t = list(cohort2total.values())

# TODO: this should be per sample

return {
'present HPO terms per cohort (mean)': float(np.mean(c2p)),
'present HPO terms per cohort (median)': float(np.median(c2p)),
'present HPO terms per cohort (min)': float(np.min(c2p)),
'present HPO terms per cohort (max)': float(np.max(c2p)),

'excluded HPO terms per cohort (mean)': float(np.mean(c2e)),
'excluded HPO terms per cohort (median)': float(np.median(c2e)),
'excluded HPO terms per cohort (min)': float(np.min(c2e)),
'excluded HPO terms per cohort (max)': float(np.max(c2e)),

'total HPO terms per cohort (mean)': float(np.mean(c2t)),
'total HPO terms per cohort (median)': float(np.median(c2t)),
'total HPO terms per cohort (min)': float(np.min(c2t)),
'total HPO terms per cohort (max)': float(np.max(c2t)),
}


def check_disease_id(self):
"""
Expand Down Expand Up @@ -227,7 +289,7 @@ def get_disease_to_phenopacket_count_d(self) -> typing.Dict[str, int]:
print(f"Extracted a total of {len(disease_to_ppkt_count_d)} diseases")
return disease_to_ppkt_count_d

def _get_gene_symbol(self, ppkt: Phenopacket) -> str:
def _get_gene_symbol(self, ppkt: Phenopacket) -> typing.Optional[str]:
if len(ppkt.interpretations) == 0:
print(f"Warning: Got no interpretations for {ppkt.id}. Skipping")
return None
Expand Down
95 changes: 91 additions & 4 deletions src/ppktstore/stats/_summary.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import re
import typing
import warnings

Expand All @@ -7,14 +8,29 @@
from collections import defaultdict

from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket
from phenopackets.schema.v2.core.meta_data_pb2 import MetaData
from phenopackets.schema.v2.core.base_pb2 import TimeElement, Age
from phenopackets.schema.v2.core.interpretation_pb2 import Diagnosis
from phenopackets.schema.v2.core.individual_pb2 import Sex, Individual, VitalStatus
from phenopackets.schema.v2.core.meta_data_pb2 import MetaData

from phenopackets.vrsatile.v1.vrsatile_pb2 import VariationDescriptor

from ppktstore.model import PhenopacketStore


iso_duration_pt = re.compile(
r"^P((?P<years>\d+)Y)?((?P<months>\d+)M)?((?P<days>\d+)D)?(T((?P<hours>\d+)H)?((?P<minutes>\d+)M)?((?P<seconds>\d+)S)?)?$"
)
unit_to_days = {
"years": 365.25,
"months": 30,
"days": 1,
"hours": 1 / 24,
"minutes": 1 / (24 * 60),
"seconds": 1 / (24 * 60 * 60),
}


def summarize_diseases_and_genotype(
phenopacket_store: PhenopacketStore,
) -> pd.DataFrame:
Expand Down Expand Up @@ -52,7 +68,21 @@ def summarize_diseases_and_genotype(
data["cohort"].append(cohort.name)
data["filename"].append(pp_path)

return pd.DataFrame(data)
columns = pd.Index(
[
"patient_id",
"cohort",
"disease_id",
"disease",
"gene",
"allele_1",
"allele_2",
"PMID",
"filename",
]
)

return pd.DataFrame(data, columns=columns)


def summarize_genotype_phenotype(
Expand Down Expand Up @@ -126,6 +156,32 @@ def summarize_cohort_sizes(
return pd.DataFrame(rows)


def summarize_individuals(
store: PhenopacketStore,
) -> pd.DataFrame:
data = defaultdict(list)
for cohort_info in store.cohorts():
for pp_info in cohort_info.phenopackets:
pp = pp_info.phenopacket
subject = pp.subject

data["id"].append(f"{pp.id}-{subject.id}")
data["sex"].append(Sex.Name(subject.sex))
n_days = _extract_age(subject)
data["age_in_days"].append(n_days)
data["age_in_years"].append(n_days / unit_to_days["years"])

vs = None
if subject.HasField("vital_status"):
vs = VitalStatus.Status.Name(subject.vital_status.status)
else:
vs = None
data["vital_status"].append(vs)

columns = pd.Index(["id", "sex", "age_in_days", "age_in_years", "vital_status"])
return pd.DataFrame(data=data, columns=columns)


def _get_encounter_count(
pp: Phenopacket,
) -> int:
Expand All @@ -149,7 +205,7 @@ def _get_pmid(


def _get_gene_and_alleles(
diagnosis: Diagnosis
diagnosis: Diagnosis,
) -> typing.Tuple[typing.Optional[str], typing.Sequence[str]]:
"""
Extract the gene symbol, failing that the label, for a variant from the Interpretation object.
Expand Down Expand Up @@ -185,7 +241,7 @@ def _get_gene_and_alleles(
genotype = var_desc.allelic_state
if genotype.label == "homozygous" and len(alleles) > 0:
alleles.append(alleles[0])

return gene, alleles


Expand All @@ -210,3 +266,34 @@ def _get_structural_var(
else:
raise ValueError(f"Could not find structural_type field")
return alleles


def _extract_age(
subject: Individual,
) -> float:
if subject.HasField("time_at_last_encounter"):
tale = subject.time_at_last_encounter
if tale.HasField("age"):
age = tale.age
ageperiod = age.iso8601duration
if ageperiod is not None or ageperiod != "":
return _parse_to_days(ageperiod)

return float("nan")


def _parse_to_days(
age_period: str,
) -> float:
match = iso_duration_pt.match(age_period)
if match is None:
return float("nan")
else:
n_days = 0.0

for unit, multiplier in unit_to_days.items():
val = match.group(unit)
if val is not None:
n_days += float(val) * multiplier

return n_days
59 changes: 59 additions & 0 deletions src/ppktstore/stats/_test_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import typing

import pytest


from ._summary import iso_duration_pt, _parse_to_days


@pytest.mark.parametrize(
"value, years, months, days, hours, minutes, seconds",
[
("P3Y6M4DT12H30M5S", "3", "6", "4", "12", "30", "5"),
("P3Y6M4D", "3", "6", "4", None, None, None),
("P3Y", "3", None, None, None, None, None),
("P6M", None, "6", None, None, None, None),
("P4D", None, None, "4", None, None, None),
],
)
def test_iso_duration_pt__ok_values(
value: str,
years: typing.Optional[str],
months: typing.Optional[str],
days: typing.Optional[str],
hours: typing.Optional[str],
minutes: typing.Optional[str],
seconds: typing.Optional[str],
):
match = iso_duration_pt.match(value)

assert match is not None

assert match.group("years") == years
assert match.group("months") == months
assert match.group("days") == days
assert match.group("hours") == hours
assert match.group("minutes") == minutes
assert match.group("seconds") == seconds


@pytest.mark.parametrize(
"value, expected",
[
("P3Y6M4DT12H30M5S", 1280.27089,),
("P3Y6M4D", 1279.75,),
("P3Y", 1095.75,),
("P3Y4M", 1215.75,),
("P6M", 180.,),
("P4D", 4.,),
("PT12H", .5,),
("PT1S", .00001157407,),
],
)
def test__parse_to_days(
value: str,
expected: float,
):
actual = _parse_to_days(value)

assert actual == pytest.approx(expected)
Loading