Skip to content

Commit

Permalink
Merge pull request #64 from erikwolfsohn/seqsender_gisaid_dev
Browse files Browse the repository at this point in the history
GISAID covCLI bugfixes & BioSample/SRA modifications
  • Loading branch information
dthoward96 committed Sep 6, 2024
2 parents 60da176 + a4fff17 commit 971bfba
Show file tree
Hide file tree
Showing 9 changed files with 136 additions and 39 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,5 @@ docker-compose-*.yaml

# ignore folders
**/.Rproj.user
**/test_data/*
**/gisaid_cli/*
2 changes: 1 addition & 1 deletion argument_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def args_parser():
required=True)
file_parser.add_argument("--fasta_file",
help="Fasta file used to generate submission files; fasta header should match the column 'sequence_name' stored in your metadata. Input either full file path or if just file name it must be stored at '<submission_dir>/<submission_name>/<fasta_file>'.",
required=True)
default = None)
file_parser.add_argument("--table2asn",
help="Perform a table2asn submission instead of GenBank FTP submission for organism choices 'FLU' or 'COV'.",
required=False,
Expand Down
15 changes: 10 additions & 5 deletions biosample_sra_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ def create_manual_submission_files(database: str, submission_dir: str, metadata:
column_ordered = ["sample_name","library_ID"]
prefix = "sra-"
# Create SRA specific fields
metadata["sra-title"] = config_dict["Description"]["Title"]
if 'sra-title' not in metadata.columns or metadata['sra-title'].isnull().any() or (metadata['sra-title'] == '').any():
metadata["sra-title"] = config_dict["Description"]["Title"]
filename_cols = [col for col in metadata.columns.tolist() if re.match("sra-file_[1-9]\d*", col)]
# Correct index for filename column
for col in filename_cols:
Expand Down Expand Up @@ -142,8 +143,10 @@ def create_submission_xml(organism: str, database: str, submission_name: str, co
# Remove columns with bs-prefix that are not attributes
biosample_cols = [col for col in database_df.columns.tolist() if (col.startswith('bs-')) and (col not in ["bs-sample_name", "bs-package", "bs-description"])]
for col in biosample_cols:
attribute = etree.SubElement(attributes, "Attribute", attribute_name=col.replace("bs-",""))
attribute.text = row[col]
attribute_value = row[col]
if pd.notnull(attribute_value) and attribute_value.strip() != "":
attribute = etree.SubElement(attributes, "Attribute", attribute_name=col.replace("bs-",""))
attribute.text = row[col]
# Add collection date to Attributes
attribute = etree.SubElement(attributes, "Attribute", attribute_name="collection_date")
attribute.text = row["collection_date"]
Expand Down Expand Up @@ -176,8 +179,10 @@ def create_submission_xml(organism: str, database: str, submission_name: str, co
# Remove columns with sra- prefix that are not attributes
sra_cols = [col for col in database_df.columns.tolist() if col.startswith('sra-') and not re.match("(sra-sample_name|sra-file_location|sra-file_\d*)", col)]
for col in sra_cols:
attribute = etree.SubElement(addfiles, "Attribute", name=col.replace("sra-",""))
attribute.text = row[col]
attribute_value = row[col]
if pd.notnull(attribute_value) and attribute_value.strip() != "":
attribute = etree.SubElement(addfiles, "Attribute", name=col.replace("sra-",""))
attribute.text = row[col]
if pd.notnull(row["bioproject"]) and row["bioproject"].strip() != "":
attribute_ref_id = etree.SubElement(addfiles, "AttributeRefId", name="BioProject")
refid = etree.SubElement(attribute_ref_id, "RefId")
Expand Down
74 changes: 58 additions & 16 deletions gisaid_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,22 @@ def create_gisaid_files(organism: str, database: str, submission_name: str, subm
# Get column names for gisaid submission only
gisaid_df = metadata.filter(regex=GISAID_REGEX).copy()
gisaid_df.columns = gisaid_df.columns.str.replace("gs-","").str.strip()
#Add required GISAID fields
# Add required GISAID fields
# covCLI returns an error when authors or collection_date are capitalized
if organism in ["COV", "POX", "ARBO"]:
gisaid_df = gisaid_df.rename(columns = {"sample_name": "virus_name", "authors": "Authors", "collection_date": "Collection_Date"})
gisaid_df = gisaid_df.rename(columns = {"sample_name": "virus_name"})
if organism in ["POX", "ARBO"]:
gisaid_df = gisaid_df.rename(columns = {"authors": "Authors", "collection_date": "Collection_Date"})
if organism == "COV":
prefix_name = "covv_"
else:
prefix_name = organism.lower() + "_"
gisaid_df = gisaid_df.add_prefix(prefix_name)
gisaid_df["submitter"] = config_dict["Username"]
gisaid_df["fn"] = ""
# Copy FASTA headers from sequence_name column to fn column
gisaid_df["fn"] = gisaid_df["covv_sequence_name"]
# Dropping the original column so it doesn't cause issues later.
gisaid_df.drop(columns=['covv_sequence_name'], inplace=True)
first_cols = ["submitter", "fn", (prefix_name + "virus_name")]
elif "FLU" in organism:
gisaid_df = gisaid_df.rename(columns = {"authors": "Authors", "collection_date": "Collection_Date"})
Expand All @@ -51,6 +57,9 @@ def create_gisaid_files(organism: str, database: str, submission_name: str, subm
# Restructure column order
last_cols = [col for col in gisaid_df.columns if col not in first_cols]
gisaid_df = gisaid_df[first_cols + last_cols]
# If a submission fails and a new one is initiated without cleaning up metadata.csv/orig_metadata.csv, the column prefix doubles up.
# This works for now, might set up some housekeeping in the future
gisaid_df.columns = gisaid_df.columns.str.replace('^covv_covv_', 'covv_', regex=True)
# Create submission files
file_handler.save_csv(df=gisaid_df, file_path=submission_dir, file_name="metadata.csv")
shutil.copy(os.path.join(submission_dir, "metadata.csv"), os.path.join(submission_dir, "orig_metadata.csv"))
Expand All @@ -68,21 +77,50 @@ def process_gisaid_log(log_file: str, submission_dir: str) -> pd.DataFrame:
while line:
# If accession generated record it
# Pattern options: "msg:": "<Sample Name>; <EPI_ISL/EPI_ID>_<Accession Numbers>" OR <epi_isl_id/epi_id>: <Sample Name>; <EPI_ISL/EPI_ID>_<Accession Numbers>
if re.match("(?i)(\W|^)(\"msg\":\s*\"\S+.*;\s*(EPI_ISL|EPI_ID)_\d{6,}\"|(epi_id|epi_isl_id):\s*\S.*;\s*(EPI_ISL_|EPI)\d+)(\W|$)", line):
gisaid_string = re.sub("(\W|^)\"msg\":\s*\"", "", line)
gisaid_string_list: List[str] = gisaid_string.strip().replace("\"", "").split(";")
sample_name = re.sub("(epi_isl_id|epi_id):\s*", "", gisaid_string_list[0].strip())
# Changed re.match to re.search to return tokens that aren't at the beginning. Changed the regex to capture an arbitrary amount of numbers after EPI_
if re.search("(?i)(\W|^)(\"msg\":\s*\"\S+.*;\s*(EPI_ISL|EPI_ID)_\d*\"|(epi_id|epi_isl_id):\s*\S.*;\s*(EPI_ISL_|EPI)\d+)(\W|$)", line):
gisaid_string = re.findall(r'(?:[a-zA-Z0-9_-]+(?:/[a-zA-Z0-9_-]+)+|EPI_\w*)', line)
gisaid_string = ' '.join(gisaid_string)
gisaid_string_list: List[str] = gisaid_string.split(' ')
sample_name = gisaid_string_list[0].strip()
accession = gisaid_string_list[1].strip()
if re.match("EPI_ISL_\d+", accession):
gisaid_isolate_log.append({"gs-sample_name":sample_name, "gisaid_accession_epi_isl_id":accession})
elif re.match("EPI\d+", accession):
gisaid_segment_log.append({"gs-segment_name":sample_name, "gisaid_accession_epi_id":accession})
# handling if submitting samples that have already been registered in GISAID
elif re.search(r'"code":\s*"validation_error".*?already exists;\s*existing_virus_name:', line):
sample_name = re.search(r"(hCoV[^;]+);", line)
sample_name = sample_name.group(1) if sample_name else None
if re.search(r"\['(EPI_ISL_\d+)'\]", line):
accession = re.search(r"\['(EPI_ISL_\d+)'\]", line)
accession = accession.group(1)
gisaid_isolate_log.append({"gs-sample_name":sample_name, "gisaid_accession_epi_isl_id":accession})
elif re.search(r"\['(EPI_\d+)'\]", line):
accession = re.search(r"\['(EPI_\d+)'\]", line)
accession = accession.group(1)
gisaid_segment_log.append({"gs-segment_name":sample_name, "gisaid_accession_epi_id":accession})
else:
print("Finished reading GISAID log. If workflow has failed here, it's likely no GISAID IDs were returned. Check results in GISAID upload log.")
line = file.readline().strip()
gisaid_isolate_df = pd.DataFrame(gisaid_isolate_log)
gisaid_segment_df = pd.DataFrame(gisaid_segment_log)
upload_log.update_submission_status_csv(submission_dir=submission_dir.replace("/GISAID", "/"), update_database="GISAID", update_df=gisaid_isolate_df)
upload_log.update_submission_status_csv(submission_dir=submission_dir.replace("/GISAID", "/"), update_database="GISAID", update_df=gisaid_segment_df)
gisaid_isolate_df = gisaid_isolate_df[~gisaid_isolate_df["gisaid_accession_epi_isl_id"].str.contains("EPI_ISL_\d{6,}", regex = True, na = False)].copy()

# Avoid passing an empty data frame to update_submission_status_csv since it was causing issues.
# Output types of GISAID entities found
if not gisaid_isolate_df.empty and not gisaid_segment_df.empty:
print("GISAID isolates and GISAID segments found.")
upload_log.update_submission_status_csv(submission_dir=submission_dir.replace("/GISAID", "/"), update_database="GISAID", update_df=gisaid_isolate_df)
upload_log.update_submission_status_csv(submission_dir=submission_dir.replace("/GISAID", "/"), update_database="GISAID", update_df=gisaid_segment_df)
elif not gisaid_isolate_df.empty:
print("GISAID isolates found.")
upload_log.update_submission_status_csv(submission_dir=submission_dir.replace("/GISAID", "/"), update_database="GISAID", update_df=gisaid_isolate_df)
elif not gisaid_segment_df.empty:
print("GISAID segments found.")
upload_log.update_submission_status_csv(submission_dir=submission_dir.replace("/GISAID", "/"), update_database="GISAID", update_df=gisaid_segment_df)
else:
print("Warning: no GISAID isolates or segments found")
gisaid_isolate_df = gisaid_isolate_df[~gisaid_isolate_df["gisaid_accession_epi_isl_id"].str.contains("EPI_ISL_\d*", regex = True, na = False)].copy()
return gisaid_isolate_df[["gs-sample_name"]]

# Submit to GISAID
Expand Down Expand Up @@ -120,6 +158,7 @@ def submit_gisaid(organism: str, submission_dir: str, submission_name: str, conf
while not os.path.exists(log_file):
time.sleep(10)
# Check submission log to see if all samples are uploaded successfully
# Check if not_submitted_df has contents before trying to process it
not_submitted_df = process_gisaid_log(log_file=log_file, submission_dir=submission_dir)
# If submission completed, no more attempts
if not_submitted_df.empty:
Expand All @@ -132,10 +171,13 @@ def submit_gisaid(organism: str, submission_dir: str, submission_name: str, conf
if "FLU" in organism:
column_name = "Isolate_Name"
elif "COV" in organism:
column_name = "virus_name"
column_name = "covv_virus_name"
metadata_df = metadata_df.merge(not_submitted_df, how="inner", left_on=column_name, right_on="gs-sample_name")
fasta_names = metadata_df["gs-sequence_name"].tolist()
metadata_df = metadata_df.drop(columns=["gs-sample_name", "gs-sequence_name"])
fasta_names = metadata_df["sequence_name"].tolist()
if "sequence_name" in metadata_df.columns:
metadata_df.rename(columns={"sequence_name": "fn"}, inplace=True)
metadata_df = metadata_df.drop(columns=["gs-sample_name"])
metadata_df = metadata_df.columns.str.replace('^covv_covv_', 'covv_', regex=True)
metadata_df.to_csv(orig_metadata, header = True, index = False)
fasta_dict = []
with open(orig_fasta, "r") as fsa:
Expand Down Expand Up @@ -163,16 +205,16 @@ def update_gisaid_files(organism: str, submission_dir: str, submission_status_fi
orig_fasta = os.path.join(submission_dir, "orig_sequence.fsa")
# Filter out genbank that has accession number
genbank_status_df = status_df[status_df["genbank-status"].str.contains("processed-ok", na=False)].copy()
gisaid_status_df = genbank_status_df[["gs-sample_name", "gs-sequence_name"]]
gisaid_status_df = genbank_status_df[["gs-sample_name", "sequence_name"]]
# Add required gisaid fields
metadata_df = pd.read_csv(metadata, header = 0, dtype = str, engine = "python", encoding="utf-8", index_col=False)
if "FLU" in organism:
column_name = "Isolate_Name"
elif "COV" in organism:
column_name = "virus_name"
metadata_df = metadata_df.merge(gisaid_status_df, how="inner", left_on=column_name, right_on="gs-sample_name")
fasta_names = metadata_df["gs-sequence_name"].tolist()
metadata_df = metadata_df.drop(columns=["gs-sample_name", "gs-sequence_name"])
fasta_names = metadata_df["sequence_name"].tolist()
metadata_df = metadata_df.drop(columns=["gs-sample_name", "sequence_name"])
metadata_df.to_csv(orig_metadata, header = True, index = False)
fasta_dict = []
with open(orig_fasta, "r") as fsa:
Expand Down
3 changes: 0 additions & 3 deletions ncbi_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,6 @@ def submit_ncbi(database: str, submission_name: str, submission_dir: str, config
tools.check_credentials(config_dict=config_dict, database="NCBI")
# Submit sequences to NCBI via FTP Server
print(f"Uploading sample files to NCBI-{database}, as a '{submission_type}' submission. If this is not intended, interrupt immediately.", file=sys.stdout)
if submission_type != "TEST":
print("Error: submission_type " + str(submission_type))
sys.exit(1)
time.sleep(5)
try:
# Login into NCBI FTP Server
Expand Down
5 changes: 5 additions & 0 deletions seqsender.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@

from settings import VERSION

import sys

# Define current time
STARTTIME = datetime.now()

Expand Down Expand Up @@ -60,6 +62,9 @@ def prep(database: List[str], organism: str, submission_dir: str, submission_nam
file_dict[file_type] = updated_path # type: ignore
# load config file
config_dict = tools.get_config(config_file=file_dict["config_file"], database=database)
# warn if user is submitting biosample & sra together with Link_Sample_Between_NCBI_Databases off
if not config_dict["NCBI"]["Link_Sample_Between_NCBI_Databases"] and 'SRA' in database and 'BIOSAMPLE' in database:
print("Warning: You are submitting to BioSample and SRA together, but Link_Sample_Between_NCBI_Databases is off. If you did not intend to do this, SRA submission is likely to fail.\nIf this was unintentional, cancel the submission and set 'Link_Sample_Between_NCBI_Databases: on' in your config yaml.")
# load metadata file
metadata = tools.get_metadata(database=database, organism=organism, metadata_file=file_dict["metadata_file"], config_dict=config_dict, skip_validation=skip_validation)
# Load fasta file into metadata
Expand Down
8 changes: 5 additions & 3 deletions settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,14 @@
# Genbank metadata regex
GENBANK_REGEX = "^gb-sample_name$"

# Also added sequence_name to these Genbank entries since I needed it for GISAID, but haven't tested the GenBank workflow yet
# GenBank source file metadata regex
GENBANK_REGEX_SRC = "^gb-sample_name$|^src-|^bioproject$|^organism$|^collection_date$"
GENBANK_REGEX_SRC = "^sequence_name$|^gb-sample_name$|^src-|^bioproject$|^organism$|^collection_date$"

# GenBank comment file metadata regex
GENBANK_REGEX_CMT = "^gb-sample_name$|^cmt-|^organism$|^collection_date$"
GENBANK_REGEX_CMT = "^sequence_name$|^gb-sample_name$|^cmt-|^organism$|^collection_date$"

##### GISAID settings #####
# GISAID metadata regex
GISAID_REGEX = "^gs-|^collection_date$|^authors"
# Added ^sequence_name$ here - it seemed to be getting filtered out too early and causing issues
GISAID_REGEX = "^gs-|^sequence_name$|^collection_date$|^authors$"
16 changes: 15 additions & 1 deletion tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from datetime import datetime, timedelta
from cerberus import Validator
import re
import shutil

from config.seqsender.seqsender_schema import schema as seqsender_schema
from settings import SCHEMA_EXCLUSIONS, BIOSAMPLE_REGEX, SRA_REGEX, GISAID_REGEX, GENBANK_REGEX, GENBANK_REGEX_CMT, GENBANK_REGEX_SRC
Expand Down Expand Up @@ -132,8 +133,21 @@ def parse_hold_date(config_dict: Dict[str, Any]):
def get_metadata(database: List[str], organism: str, metadata_file: str, config_dict: Dict[str, Any], skip_validation: bool = False) -> pd.DataFrame:
# Read in metadata file
metadata = file_handler.load_csv(metadata_file)
# Adding some default handling for bs-description here - it's not mandatory for biosample submission, but is required in the workflow
if "BIOSAMPLE" in database:
if 'bs-description' not in metadata.columns:
# Create 'bs-description' by joining 'organism', 'bs-host', and 'bs-host_disease' columns
metadata['bs-description'] = metadata[['organism', 'bs-host', 'bs-host_disease']].apply(
lambda row: ' '.join(row.dropna().astype(str)), axis=1
)
# If there are empty biosample or sra columns in the metadata sheet, assume they are unused optional columns and drop them before pandera attempts to validate
columns_to_drop = [col for col in metadata.columns if (col.startswith('bs-') or col.startswith('sra-')) and metadata[col].replace('', pd.NA).apply(lambda x: x.strip() if isinstance(x, str) else x).isna().all()]
if columns_to_drop:
metadata = metadata.drop(columns=columns_to_drop)
shutil.copy(metadata_file, metadata_file + ".bak")
file_handler.save_csv(metadata, metadata_file)
# Update seqsender base schema to include needed checks
if "BioSample" in database or "SRA" in database:
if "BIOSAMPLE" in database or "SRA" in database:
seqsender_schema.update_columns({"bioproject":{"checks":Check.str_matches(r"^(?!\s*$).+"),"nullable":False,"required":True}})
biosample_schema = sra_schema = genbank_schema = genbank_cmt_schema = genbank_src_schema = gisaid_schema = None
# Import schemas
Expand Down
Loading

0 comments on commit 971bfba

Please sign in to comment.