Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend api #14

Merged
merged 6 commits into from
Apr 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 24 additions & 5 deletions backend/api/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import os

from preprocessing.visualization import generate_chords
from preprocessing.merge_clean import merge_modalities, clean_extra_columns
from preprocessing.studypicker import rank_cohorts

from contextlib import asynccontextmanager

Expand Down Expand Up @@ -63,19 +65,30 @@ def get_current_version():

@app.get("/cdm", tags=["info"])
def get_cdm():
    """Return the complete CDM, merged across all modalities, as a dictionary.

    Returns:
        dict: Column-oriented dictionary produced by ``DataFrame.to_dict()``.
    """
    # merge_modalities() reads and concatenates every CSV itself, so the
    # previous manual listdir/read_csv/concat pass was redundant work whose
    # result was immediately overwritten.
    cdm = merge_modalities()
    return cdm.to_dict()


@app.get("/cdm/cohorts", tags=["info"])
def get_cohorts():
    """List the CDM columns that remain after dropping the extra columns.

    Returns:
        dict: Mapping of positional index to column name.
    """
    # NOTE(review): clean_extra_columns() is called with its defaults, which
    # do not include "Feature" - so that column name is part of the result.
    # Confirm this is intended.
    cohort_table = clean_extra_columns(merge_modalities())
    return dict(enumerate(cohort_table.columns))


@app.get("/cdm/features", tags=["info"])
def get_features():
    """Return the "Feature" column of the merged CDM as a dictionary.

    Returns:
        dict: Column-oriented dictionary produced by ``DataFrame.to_dict()``.
    """
    return merge_modalities(usecols=["Feature"]).to_dict()


@app.get("/cdm/modalities", tags=["info"])
def get_modalities():
    """List the available modalities, i.e. the CSV files in "./cdm".

    Returns:
        dict: Mapping of positional index to modality name (file name
            without the ".csv" extension), in alphabetical order.
    """
    # Strip only the trailing extension: str.replace would also remove a
    # ".csv" occurring in the middle of a file name. Sorting keeps the ids
    # stable, because os.listdir returns entries in arbitrary order.
    names = sorted(
        entry.removesuffix(".csv")
        for entry in os.listdir("./cdm")
        if entry.endswith(".csv")
    )
    return dict(enumerate(names))


@app.get("/cdm/modalities/{modality}", tags=["search"])
def get_modality(modality: str):
    """Return the mappings of a single modality as a dictionary.

    Args:
        modality (str): Name of the modality; the file "./cdm/<modality>.csv"
            is read.

    Returns:
        dict: Column-oriented dictionary produced by ``DataFrame.to_dict()``.
    """
    # Only the "/cdm/modalities/{modality}" route is registered; the older
    # "/cdm/{modality}" decorator was superseded by it.
    csv_path = os.path.join("./cdm", f"{modality}.csv")
    mappings = pd.read_csv(csv_path, keep_default_na=False)
    return mappings.to_dict()
Expand All @@ -85,3 +98,9 @@ def get_modality(modality: str):
def get_chords(modality: str, cohorts: list[str]):
    """Return the chord linkage data and its decoder for a modality.

    Args:
        modality (str): Name of the modality.
        cohorts (list[str]): Cohorts to include in the linkage.

    Returns:
        tuple: The two values produced by ``generate_chords`` (chords, decoder).
    """
    linkage, lookup = generate_chords(modality, cohorts)
    return linkage, lookup


@app.post("/studypicker/rank", tags=["studypicker"])
def get_ranked_cohorts(features: list[str]):
    """Rank cohorts by availability of the requested features.

    Args:
        features (list[str]): Features the caller is interested in.

    Returns:
        dict: ``rank_cohorts`` result converted with ``DataFrame.to_dict()``.
    """
    return rank_cohorts(features).to_dict()
49 changes: 49 additions & 0 deletions backend/preprocessing/merge_clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import os
import pandas as pd


def merge_modalities(folder: str="./cdm", usecols: None | list[str] = None) -> pd.DataFrame:
"""Merges all the modalities to create PASSIONATE CDM.

Args:
folder (str, optional): Path to folder containing the modalities. Defaults to "./cdm".
usecols (None | list[str], optional): Columns to use. Defaults to None.

Raises:
FileNotFoundError: The folder does not exist
FileNotFoundError: The folder is empty
ValueError: usecols list cannot be empty

Returns:
cdm (pd.DataFrame): PASSIONATE CDM containing all the modalities
"""
# Check if the folder exists
if not os.path.exists(folder):
raise FileNotFoundError(f"the folder '{folder}' does not exist.")
# Check if the folder is empty
if not bool(os.listdir(folder)):
raise FileNotFoundError(f"the folder '{folder}' is empty.")
# Check if the usecols is not None and not an empty list
if usecols is not None and not usecols:
raise ValueError("The 'usecols' list cannot be empty. Please specify columns to use")
files = [file for file in os.listdir(folder) if file.endswith(".csv")]
dfs = [pd.read_csv(os.path.join(folder, file), keep_default_na=False, usecols=usecols) for file in files]
cdm = pd.concat(dfs, ignore_index=True)
cdm.replace({"No total score.": ""}, inplace=True)
return cdm


def clean_extra_columns(df: pd.DataFrame, extra_columns: list[str]=["CURIE", "Definition", "Synonyms", "OMOP"]):
"""Cleans additional information from a given mapping data frame.

Args:
df (pd.DataFrame): Mappings data frame
extra_columns (list[str], optional): List of columns to drop. Defaults to ["CURIE", "Definition", "Synonyms", "OMOP"].

Returns:
df (pd.DataFrame): Mappings data cleaned from additional information
"""
for column in extra_columns:
if column in df.columns:
df.drop(column, axis=1, inplace=True)
return df
62 changes: 62 additions & 0 deletions backend/preprocessing/studypicker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import os

import pandas as pd
import numpy as np

from preprocessing.merge_clean import merge_modalities, clean_extra_columns

def rank_cohorts(features: list[str], folder: str = "./cdm") -> pd.DataFrame:
    """Ranks cohorts based on the availability of requested features.

    A requested feature that does not exist in the CDM at all is reported as
    missing for every ranked cohort instead of raising a ``KeyError``.
    Cohorts that provide none of the requested features are left out.
    Cohorts with the same number of found features keep their CDM column order.

    Args:
        features (list[str]): A list of features user interested in
        folder (str, optional): Path to folder containing modalities. Defaults to "./cdm".

    Raises:
        FileNotFoundError: The folder does not exist
        FileNotFoundError: The folder is empty
        ValueError: features list cannot be empty

    Returns:
        ranked_cohorts (pd.DataFrame): A dataframe showcasing ranked cohorts
    """
    # Check if the folder exists
    if not os.path.exists(folder):
        raise FileNotFoundError(f"the folder '{folder}' does not exist.")
    # Check if the folder is empty
    if not os.listdir(folder):
        raise FileNotFoundError(f"the folder '{folder}' is empty.")
    # Check if the features list is empty
    if not features:
        raise ValueError("The 'features' list cannot be empty")
    total_features = len(features)
    # Merge the modalities together and use NaN for missing values
    cdm = merge_modalities(folder=folder).replace({"": np.nan})
    # Set Feature column as the index and drop non-cohort columns
    cdm = clean_extra_columns(cdm.set_index("Feature"))
    # Split the request: .loc raises KeyError for labels absent from the index
    known = [feature for feature in features if feature in cdm.index]
    unknown = [feature for feature in features if feature not in cdm.index]
    # Filter the CDM based on the requested features that exist
    mappings = cdm.loc[known, :]
    rows = []
    for column in mappings.columns:
        available = mappings[column].notna()
        # If the cohort has none of the features continue with the next one
        if not available.any():
            continue
        # Features without a mapping, followed by those unknown to the CDM
        missing_features = mappings.index[~available].tolist() + unknown
        rows.append([column, int(available.sum()), ", ".join(map(str, missing_features))])
    # Build the frame in one go; the explicit int cast keeps the count column
    # numeric even when no cohort qualified and the frame is empty
    ranked_cohorts = pd.DataFrame(rows, columns=["Cohort (ranked)", "Successfully found", "Missing features"])
    ranked_cohorts["Successfully found"] = ranked_cohorts["Successfully found"].astype(int)
    # Sort values based on the number of successfully found features;
    # a stable sort keeps tied cohorts in their original order
    ranked_cohorts = ranked_cohorts.sort_values(by="Successfully found", ascending=False, kind="stable")
    found = ranked_cohorts["Successfully found"]
    # Calculate the percentage of features found
    percentage_found = ((found / total_features) * 100).round(2)
    # Format the "Successfully found" column so that it displays the data in
    # "(found_features)/(total_features) (percentage_found)" format
    ranked_cohorts["Successfully found"] = (
        found.astype(str) + "/" + str(total_features) + " (" + percentage_found.astype(str) + "%)"
    )
    return ranked_cohorts
19 changes: 18 additions & 1 deletion backend/preprocessing/visualization.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import os

import pandas as pd


def generate_chords(modality: str, cohorts: list[str], folder='./cdm'):
def generate_chords(modality: str, cohorts: list[str], folder: str='./cdm'):
"""Generate linkage information for cohorts in a modality.

The variables of each cohort will be encoded in numbers consecutively
Expand All @@ -11,11 +13,26 @@ def generate_chords(modality: str, cohorts: list[str], folder='./cdm'):
Args:
modality (str): Name of the modality
cohorts (list[str]): Cohorts to be included in the mappings
folder (str, optional): Path to the folder containing the modality

Raises:
FileNotFoundError: The folder does not exist
FileNotFoundError: The folder is empty
ValueError: cohorts list cannot be empty

Returns:
chords (dict[Hashable, Any]): A dictionary containing linkage information
decoder (dict[Hashable, Any]): A dictionary to decode the numbers in the linkage information
"""
# Check if the folder exists
if not os.path.exists(folder):
raise FileNotFoundError(f"the folder '{folder}' does not exist.")
# Check if the folder is empty
if not bool(os.listdir(folder)):
raise FileNotFoundError(f"the folder '{folder}' is empty.")
# Check if the cohorts list is empty
if not cohorts:
raise ValueError("The 'cohorts' list cannot be empty.")
# Initialize a dictionary to decode the numbers of variables later on, and save used cohorts
decoder = {}
decoder["cohorts"] = cohorts
Expand Down
4 changes: 4 additions & 0 deletions backend/tests/resources/modalities/test_modality1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Feature,cohort_x,cohort_y,cohort_z,cohort_d,Synonyms
age,age_onset,age,dg_25234,Age,Age
height,,,dg_45236,Height,Height
sex,biological_sex,gender,dg_12312,Sex,Biological Sex
4 changes: 4 additions & 0 deletions backend/tests/resources/modalities/test_modality2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Feature,cohort_x,cohort_y,cohort_z,cohort_d,Synonyms
mmse,mmse_total,mmse_score,cl_658275,MMSE,MMSE
gds_sf,,,cl_646821,GDS_SF,GDS_SF
apoe,,,,,APOE
2 changes: 1 addition & 1 deletion backend/tests/resources/test_mappings.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
feature,cohort_x,cohort_y,cohort_z,cohort_d
Feature,cohort_x,cohort_y,cohort_z,cohort_d
age,age_x,age_y,,
height,height_x,height_y,height_z,
weight,weight_x,,weight_z,
Expand Down
38 changes: 38 additions & 0 deletions backend/tests/test_merge_clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from preprocessing.merge_clean import merge_modalities, clean_extra_columns

def test_merge_modalities():
    """Merging the two test modalities yields six rows of the selected columns."""
    selected = ["Feature", "cohort_x", "cohort_d", "Synonyms"]
    merged = merge_modalities(folder="./backend/tests/resources/modalities", usecols=selected)

    expected = {
        "Feature": {0: "age", 1: "height", 2: "sex", 3: "mmse", 4: "gds_sf", 5: "apoe"},
        "cohort_x": {0: "age_onset", 1: "", 2: "biological_sex", 3: "mmse_total", 4: "", 5: ""},
        "cohort_d": {0: "Age", 1: "Height", 2: "Sex", 3: "MMSE", 4: "GDS_SF", 5: ""},
        "Synonyms": {0: "Age", 1: "Height", 2: "Biological Sex", 3: "MMSE", 4: "GDS_SF", 5: "APOE"},
    }

    # Shape: six features across the four requested columns
    assert merged.shape == (6, 4)
    # Column names and their order
    assert merged.columns.tolist() == ["Feature", "cohort_x", "cohort_d", "Synonyms"]
    # Cell values
    assert expected == merged.to_dict()

def test_clean_extra_columns():
    """Dropping "Feature" and "Synonyms" leaves only the two cohort columns."""
    merged = merge_modalities(
        folder="./backend/tests/resources/modalities",
        usecols=["Feature", "cohort_x", "cohort_d", "Synonyms"],
    )
    cleaned = clean_extra_columns(merged, extra_columns=["Feature", "Synonyms"])

    expected = {
        "cohort_x": {0: "age_onset", 1: "", 2: "biological_sex", 3: "mmse_total", 4: "", 5: ""},
        "cohort_d": {0: "Age", 1: "Height", 2: "Sex", 3: "MMSE", 4: "GDS_SF", 5: ""},
    }

    # Shape: six features across the two remaining columns
    assert cleaned.shape == (6, 2)
    # Column names and their order
    assert cleaned.columns.tolist() == ["cohort_x", "cohort_d"]
    # Cell values
    assert expected == cleaned.to_dict()
23 changes: 23 additions & 0 deletions backend/tests/test_studypicker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from preprocessing.studypicker import rank_cohorts


def test_rank_cohorts():
    """Cohorts are ranked by how many of the requested features they provide."""
    features = ["age", "height", "bmi"]
    ranking = rank_cohorts(features=features, folder="./backend/tests/resources")

    expected = {
        "Cohort (ranked)": {0: "cohort_x", 1: "cohort_y", 2: "cohort_z"},
        "Successfully found": {0: "2/3 (66.67%)", 1: "2/3 (66.67%)", 2: "1/3 (33.33%)"},
        "Missing features": {0: "bmi", 1: "bmi", 2: "age, bmi"},
    }

    # Shape: three ranked cohorts, three report columns
    assert ranking.shape == (3, 3)
    # Every report column holds Python objects (strings)
    for label in ("Cohort (ranked)", "Successfully found", "Missing features"):
        assert ranking[label].dtype == "object"
    # Cell values
    assert expected == ranking.to_dict()
    # The result may only be empty when no features were requested
    assert not ranking.empty or features == []
5 changes: 3 additions & 2 deletions backend/tests/test_visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ def test_generate_chords():
chords, decoder = generate_chords(modality, cohorts, folder="./backend/tests/resources")

chords_result = {
"link_id": ["link_0", "link_0", "link_1", "link_1", "link_1", "link_2", "link_2",],
"cohort": ["cohort_x", "cohort_y", "cohort_x", "cohort_y", "cohort_z", "cohort_x", "cohort_z",],
"link_id": ["link_0", "link_0", "link_1", "link_1", "link_1", "link_2", "link_2"],
"cohort": ["cohort_x", "cohort_y", "cohort_x", "cohort_y", "cohort_z", "cohort_x", "cohort_z"],
"start": [1, 1, 2, 2, 1, 3, 2],
"end": [1, 1, 2, 2, 1, 3, 2],
}
Expand All @@ -26,5 +26,6 @@ def test_generate_chords():
"cohort_z": {1: "height_z", 2: "weight_z"},
}

# Assert the output dictionaries match the expected dictionaries
assert decoder == decoder_result
assert chords == chords_result