Skip to content
This repository has been archived by the owner on Apr 19, 2023. It is now read-only.

Commit

Permalink
Allow skipping the sample-based metadata annotation for a given sample…
Browse files Browse the repository at this point in the history
… if it is not present in the metadata under certain circumstances, i.e. a low similarity score compared to all samples defined in this metadata file.
  • Loading branch information
dweemx committed Sep 17, 2020
1 parent edcda97 commit 23e188c
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 27 deletions.
1 change: 1 addition & 0 deletions src/utils/bin/sc_file_concatenator.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
)
adata.var.index = adata.var.index.astype(str)
adata = adata[:, np.sort(adata.var.index)]
print(f"Total number of cells: {adata.obs.shape[0]}, genes: {adata.var.shape[0]}.")
else:
raise Exception("VSN ERROR: Concatenation of .{} files is not implemented.".format(args.format))

Expand Down
65 changes: 38 additions & 27 deletions src/utils/bin/sc_h5ad_annotate_by_sample_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import pandas as pd
import scanpy as sc
from difflib import SequenceMatcher

parser = argparse.ArgumentParser(description='')

Expand Down Expand Up @@ -106,38 +107,48 @@
sep="\t"
)

sample_info = metadata[metadata[args.sample_column_name] == SAMPLE_NAME]

if len(sample_info) == 0:
raise Exception(f"VSN ERROR: The metadata .tsv file does not contain sample ID '{SAMPLE_NAME}'.")
elif args.method == "sample" and len(sample_info) > 1:
raise Exception(f"VSN ERROR: The metadata .tsv file contains duplicate entries with the sample ID '{SAMPLE_NAME}'. Fix your metadata or use the 'sample+' method.")

if args.method == "sample":
for (column_name, column_data) in sample_info.iteritems():
adata.obs[column_name] = column_data.values[0]
elif args.method == "sample+":

if args.adata_comp_index_column_names is None or args.metadata_comp_index_column_names is None:
raise Exception("VSN ERROR: compIndexColumnNames param is missing in the sample_annotate config.")
def similar(a, b):
    """Return the similarity ratio between two sample identifiers.

    Both arguments are converted to ``str`` before being compared, so
    identifiers that are not strings (e.g. numeric sample IDs) are scored
    instead of raising ``TypeError`` inside ``SequenceMatcher``. String
    arguments are compared exactly as before.

    Args:
        a: First identifier.
        b: Second identifier.

    Returns:
        The ``SequenceMatcher.ratio()`` score as a float in [0, 1], where
        1.0 means the two string forms are identical.
    """
    # No "junk" heuristic (first argument None): every character counts.
    return SequenceMatcher(None, str(a), str(b)).ratio()

new_obs = pd.merge(
adata.obs,
sample_info,
left_on=["sample_id"] + args.adata_comp_index_column_names,
right_on=[args.sample_column_name] + args.metadata_comp_index_column_names
)

if new_obs.isnull().values.any():
raise Exception("VSN ERROR: Merged adata.obs not complete, some NaN values detected.")
sample_info = metadata[metadata[args.sample_column_name] == SAMPLE_NAME]
sample_scores = [similar(index_entry, SAMPLE_NAME) for index_entry in metadata[args.sample_column_name]]

# Update the obs slot of the AnnData
adata.obs = new_obs
if all(sample_score < 0.5 for sample_score in sample_scores):
# Skip annotation for this sample
print(f"Skipping annotation for {SAMPLE_NAME}.")
else:
raise Exception(f"VSN ERROR: Unrecognized method {args.method}.")

if args.annotation_column_names is not None and len(args.annotation_column_names) > 0:
adata.obs = adata.obs[args.annotation_column_names]
if len(sample_info) == 0:
raise Exception(f"VSN ERROR: The metadata .tsv file does not contain sample ID '{SAMPLE_NAME}'.")
elif args.method == "sample" and len(sample_info) > 1:
raise Exception(f"VSN ERROR: The metadata .tsv file contains duplicate entries with the sample ID '{SAMPLE_NAME}'. Fix your metadata or use the 'sample+' method.")

if args.method == "sample":
for (column_name, column_data) in sample_info.iteritems():
adata.obs[column_name] = column_data.values[0]
elif args.method == "sample+":

if args.adata_comp_index_column_names is None or args.metadata_comp_index_column_names is None:
raise Exception("VSN ERROR: compIndexColumnNames param is missing in the sample_annotate config.")

new_obs = pd.merge(
adata.obs,
sample_info,
left_on=["sample_id"] + args.adata_comp_index_column_names,
right_on=[args.sample_column_name] + args.metadata_comp_index_column_names
)

if new_obs.isnull().values.any():
raise Exception("VSN ERROR: Merged adata.obs not complete, some NaN values detected.")

# Update the obs slot of the AnnData
adata.obs = new_obs
else:
raise Exception(f"VSN ERROR: Unrecognized method {args.method}.")

if args.annotation_column_names is not None and len(args.annotation_column_names) > 0:
adata.obs = adata.obs[args.annotation_column_names]

# I/O
adata.write_h5ad("{}.h5ad".format(FILE_PATH_OUT_BASENAME))

0 comments on commit 23e188c

Please sign in to comment.