Merge pull request #64 from erikwolfsohn/seqsender_gisaid_dev

GISAID covCLI bugfixes & BioSample/SRA modifications
CDCgov · Sep 6, 2024 · 971bfba · 971bfba
2 parents 60da176 + a4fff17
commit 971bfba
Show file tree

Hide file tree

Showing 9 changed files with 136 additions and 39 deletions.
diff --git a/.gitignore b/.gitignore
@@ -19,3 +19,5 @@ docker-compose-*.yaml
 
 # ignore folders
 **/.Rproj.user
+**/test_data/*
+**/gisaid_cli/*
diff --git a/argument_handler.py b/argument_handler.py
@@ -72,7 +72,7 @@ def args_parser():
 		required=True)
 	file_parser.add_argument("--fasta_file",
 		help="Fasta file used to generate submission files; fasta header should match the column 'sequence_name' stored in your metadata. Input either full file path or if just file name it must be stored at '<submission_dir>/<submission_name>/<fasta_file>'.",
-		required=True)
+		default = None)
 	file_parser.add_argument("--table2asn",
 		help="Perform a table2asn submission instead of GenBank FTP submission for organism choices 'FLU' or 'COV'.",
 		required=False,

diff --git a/biosample_sra_handler.py b/biosample_sra_handler.py
@@ -58,7 +58,8 @@ def create_manual_submission_files(database: str, submission_dir: str, metadata:
 		column_ordered = ["sample_name","library_ID"]
 		prefix = "sra-"
 		# Create SRA specific fields
-		metadata["sra-title"] = config_dict["Description"]["Title"]
+		if 'sra-title' not in metadata.columns or metadata['sra-title'].isnull().any() or (metadata['sra-title'] == '').any():
+			metadata["sra-title"] = config_dict["Description"]["Title"]
 		filename_cols = [col for col in metadata.columns.tolist() if re.match("sra-file_[1-9]\d*", col)]
 		# Correct index for filename column
 		for col in filename_cols:
@@ -142,8 +143,10 @@ def create_submission_xml(organism: str, database: str, submission_name: str, co
 			# Remove columns with bs-prefix that are not attributes
 			biosample_cols = [col for col in database_df.columns.tolist() if (col.startswith('bs-')) and (col not in ["bs-sample_name", "bs-package", "bs-description"])]
 			for col in biosample_cols:
-				attribute = etree.SubElement(attributes, "Attribute", attribute_name=col.replace("bs-",""))
-				attribute.text = row[col]
+				attribute_value = row[col]
+				if pd.notnull(attribute_value) and attribute_value.strip() != "":
+					attribute = etree.SubElement(attributes, "Attribute", attribute_name=col.replace("bs-",""))
+					attribute.text = row[col]
 			# Add collection date to Attributes
 			attribute = etree.SubElement(attributes, "Attribute", attribute_name="collection_date")
 			attribute.text = row["collection_date"]
@@ -176,8 +179,10 @@ def create_submission_xml(organism: str, database: str, submission_name: str, co
 			# Remove columns with sra- prefix that are not attributes
 			sra_cols = [col for col in database_df.columns.tolist() if col.startswith('sra-') and not re.match("(sra-sample_name|sra-file_location|sra-file_\d*)", col)]
 			for col in sra_cols:
-				attribute = etree.SubElement(addfiles, "Attribute", name=col.replace("sra-",""))
-				attribute.text = row[col]
+				attribute_value = row[col]
+				if pd.notnull(attribute_value) and attribute_value.strip() != "":
+					attribute = etree.SubElement(addfiles, "Attribute", name=col.replace("sra-",""))
+					attribute.text = row[col]
 			if pd.notnull(row["bioproject"]) and row["bioproject"].strip() != "":
 				attribute_ref_id = etree.SubElement(addfiles, "AttributeRefId", name="BioProject")
 				refid = etree.SubElement(attribute_ref_id, "RefId")

diff --git a/gisaid_handler.py b/gisaid_handler.py
@@ -26,16 +26,22 @@ def create_gisaid_files(organism: str, database: str, submission_name: str, subm
 	# Get column names for gisaid submission only
 	gisaid_df = metadata.filter(regex=GISAID_REGEX).copy()
 	gisaid_df.columns = gisaid_df.columns.str.replace("gs-","").str.strip()
-	#Add required GISAID fields
+	# Add required GISAID fields
+	# covCLI returns an error when authors or collection_date are capitalized 
 	if organism in ["COV", "POX", "ARBO"]:
-		gisaid_df = gisaid_df.rename(columns = {"sample_name": "virus_name", "authors": "Authors", "collection_date": "Collection_Date"})
+		gisaid_df = gisaid_df.rename(columns = {"sample_name": "virus_name"})
+		if organism in ["POX", "ARBO"]:
+			gisaid_df = gisaid_df.rename(columns = {"authors": "Authors", "collection_date": "Collection_Date"})
 		if organism == "COV":
 			prefix_name = "covv_"
 		else:
 			prefix_name = organism.lower() + "_"
 		gisaid_df = gisaid_df.add_prefix(prefix_name)
 		gisaid_df["submitter"] = config_dict["Username"]
-		gisaid_df["fn"] = ""
+		# Copy FASTA headers from sequence_name column to fn column
+		gisaid_df["fn"] = gisaid_df["covv_sequence_name"]
+		# Dropping the original column so it doesn't cause issues later.
+		gisaid_df.drop(columns=['covv_sequence_name'], inplace=True)
 		first_cols = ["submitter", "fn", (prefix_name + "virus_name")]
 	elif "FLU" in organism:
 		gisaid_df = gisaid_df.rename(columns = {"authors": "Authors", "collection_date": "Collection_Date"})
@@ -51,6 +57,9 @@ def create_gisaid_files(organism: str, database: str, submission_name: str, subm
 	# Restructure column order
 	last_cols = [col for col in gisaid_df.columns if col not in first_cols]
 	gisaid_df = gisaid_df[first_cols + last_cols]
+	# If a submission fails and a new one is initiated without cleaning up metadata.csv/orig_metadata.csv, the column prefix doubles up.
+	# This works for now, might set up some housekeeping in the future
+	gisaid_df.columns = gisaid_df.columns.str.replace('^covv_covv_', 'covv_', regex=True)
 	# Create submission files
 	file_handler.save_csv(df=gisaid_df, file_path=submission_dir, file_name="metadata.csv")
 	shutil.copy(os.path.join(submission_dir, "metadata.csv"), os.path.join(submission_dir, "orig_metadata.csv"))
@@ -68,21 +77,50 @@ def process_gisaid_log(log_file: str, submission_dir: str) -> pd.DataFrame:
 		while line:
 			# If accession generated record it
 			# Pattern options: "msg:": "<Sample Name>; <EPI_ISL/EPI_ID>_<Accession Numbers>" OR <epi_isl_id/epi_id>: <Sample Name>; <EPI_ISL/EPI_ID>_<Accession Numbers>
-			if re.match("(?i)(\W|^)(\"msg\":\s*\"\S+.*;\s*(EPI_ISL|EPI_ID)_\d{6,}\"|(epi_id|epi_isl_id):\s*\S.*;\s*(EPI_ISL_|EPI)\d+)(\W|$)", line):
-				gisaid_string = re.sub("(\W|^)\"msg\":\s*\"", "", line)
-				gisaid_string_list: List[str] = gisaid_string.strip().replace("\"", "").split(";")
-				sample_name = re.sub("(epi_isl_id|epi_id):\s*", "", gisaid_string_list[0].strip())
+			# Changed re.match to re.search to return tokens that aren't at the beginning. Changed the regex to capture an arbitrary amount of numbers after EPI_
+			if re.search("(?i)(\W|^)(\"msg\":\s*\"\S+.*;\s*(EPI_ISL|EPI_ID)_\d*\"|(epi_id|epi_isl_id):\s*\S.*;\s*(EPI_ISL_|EPI)\d+)(\W|$)", line):
+				gisaid_string = re.findall(r'(?:[a-zA-Z0-9_-]+(?:/[a-zA-Z0-9_-]+)+|EPI_\w*)', line)
+				gisaid_string = ' '.join(gisaid_string)
+				gisaid_string_list: List[str] = gisaid_string.split(' ')
+				sample_name = gisaid_string_list[0].strip()
 				accession = gisaid_string_list[1].strip()
 				if re.match("EPI_ISL_\d+", accession):
 					gisaid_isolate_log.append({"gs-sample_name":sample_name, "gisaid_accession_epi_isl_id":accession})
 				elif re.match("EPI\d+", accession):
 					gisaid_segment_log.append({"gs-segment_name":sample_name, "gisaid_accession_epi_id":accession})
+			# handling if submitting samples that have already been registered in GISAID
+			elif re.search(r'"code":\s*"validation_error".*?already exists;\s*existing_virus_name:', line):
+				sample_name = re.search(r"(hCoV[^;]+);", line)
+				sample_name = sample_name.group(1) if sample_name else None
+				if re.search(r"\['(EPI_ISL_\d+)'\]", line):
+					accession =  re.search(r"\['(EPI_ISL_\d+)'\]", line)
+					accession = accession.group(1)
+					gisaid_isolate_log.append({"gs-sample_name":sample_name, "gisaid_accession_epi_isl_id":accession})
+				elif re.search(r"\['(EPI_\d+)'\]", line):
+					accession = re.search(r"\['(EPI_\d+)'\]", line)
+					accession = accession.group(1)
+					gisaid_segment_log.append({"gs-segment_name":sample_name, "gisaid_accession_epi_id":accession})
+			else:
+				print("Finished reading GISAID log. If workflow has failed here, it's likely no GISAID IDs were returned. Check results in GISAID upload log.")
 			line = file.readline().strip()
 	gisaid_isolate_df = pd.DataFrame(gisaid_isolate_log)
 	gisaid_segment_df = pd.DataFrame(gisaid_segment_log)
-	upload_log.update_submission_status_csv(submission_dir=submission_dir.replace("/GISAID", "/"), update_database="GISAID", update_df=gisaid_isolate_df)
-	upload_log.update_submission_status_csv(submission_dir=submission_dir.replace("/GISAID", "/"), update_database="GISAID", update_df=gisaid_segment_df)
-	gisaid_isolate_df = gisaid_isolate_df[~gisaid_isolate_df["gisaid_accession_epi_isl_id"].str.contains("EPI_ISL_\d{6,}", regex = True, na = False)].copy()
+
+	# Avoid passing an empty data frame to update_submission_status_csv since it was causing issues.
+	# Output types of GISAID entities found
+	if not gisaid_isolate_df.empty and not gisaid_segment_df.empty:
+		print("GISAID isolates and GISAID segments found.")
+		upload_log.update_submission_status_csv(submission_dir=submission_dir.replace("/GISAID", "/"), update_database="GISAID", update_df=gisaid_isolate_df)
+		upload_log.update_submission_status_csv(submission_dir=submission_dir.replace("/GISAID", "/"), update_database="GISAID", update_df=gisaid_segment_df)
+	elif not gisaid_isolate_df.empty:
+		print("GISAID isolates found.")
+		upload_log.update_submission_status_csv(submission_dir=submission_dir.replace("/GISAID", "/"), update_database="GISAID", update_df=gisaid_isolate_df)
+	elif not gisaid_segment_df.empty:
+		print("GISAID segments found.")
+		upload_log.update_submission_status_csv(submission_dir=submission_dir.replace("/GISAID", "/"), update_database="GISAID", update_df=gisaid_segment_df)
+	else:
+		print("Warning: no GISAID isolates or segments found")
+	gisaid_isolate_df = gisaid_isolate_df[~gisaid_isolate_df["gisaid_accession_epi_isl_id"].str.contains("EPI_ISL_\d*", regex = True, na = False)].copy()
 	return gisaid_isolate_df[["gs-sample_name"]]
 
 # Submit to GISAID
@@ -120,6 +158,7 @@ def submit_gisaid(organism: str, submission_dir: str, submission_name: str, conf
 		while not os.path.exists(log_file):
 			time.sleep(10)
 		# Check submission log to see if all samples are uploaded successfully
+		# Check if not_submitted_df has contents before trying to process it
 		not_submitted_df = process_gisaid_log(log_file=log_file, submission_dir=submission_dir)
 		# If submission completed, no more attempts
 		if not_submitted_df.empty:
@@ -132,10 +171,13 @@ def submit_gisaid(organism: str, submission_dir: str, submission_name: str, conf
 			if "FLU" in organism:
 				column_name = "Isolate_Name"
 			elif "COV" in organism:
-				column_name = "virus_name"
+				column_name = "covv_virus_name"
 			metadata_df = metadata_df.merge(not_submitted_df, how="inner", left_on=column_name, right_on="gs-sample_name")
-			fasta_names = metadata_df["gs-sequence_name"].tolist()
-			metadata_df = metadata_df.drop(columns=["gs-sample_name", "gs-sequence_name"])
+			fasta_names = metadata_df["sequence_name"].tolist()
+			if "sequence_name" in metadata_df.columns:
+				metadata_df.rename(columns={"sequence_name": "fn"}, inplace=True)
+			metadata_df = metadata_df.drop(columns=["gs-sample_name"])
+			metadata_df = metadata_df.columns.str.replace('^covv_covv_', 'covv_', regex=True)
 			metadata_df.to_csv(orig_metadata, header = True, index = False)
 			fasta_dict = []
 			with open(orig_fasta, "r") as fsa:
@@ -163,16 +205,16 @@ def update_gisaid_files(organism: str, submission_dir: str, submission_status_fi
 	orig_fasta = os.path.join(submission_dir, "orig_sequence.fsa")
 	# Filter out genbank that has accession number
 	genbank_status_df = status_df[status_df["genbank-status"].str.contains("processed-ok", na=False)].copy()
-	gisaid_status_df = genbank_status_df[["gs-sample_name", "gs-sequence_name"]]
+	gisaid_status_df = genbank_status_df[["gs-sample_name", "sequence_name"]]
 	# Add required gisaid fields
 	metadata_df = pd.read_csv(metadata, header = 0, dtype = str, engine = "python", encoding="utf-8", index_col=False)
 	if "FLU" in organism:
 		column_name = "Isolate_Name"
 	elif "COV" in organism:
 		column_name = "virus_name"
 	metadata_df = metadata_df.merge(gisaid_status_df, how="inner", left_on=column_name, right_on="gs-sample_name")
-	fasta_names = metadata_df["gs-sequence_name"].tolist()
-	metadata_df = metadata_df.drop(columns=["gs-sample_name", "gs-sequence_name"])
+	fasta_names = metadata_df["sequence_name"].tolist()
+	metadata_df = metadata_df.drop(columns=["gs-sample_name", "sequence_name"])
 	metadata_df.to_csv(orig_metadata, header = True, index = False)
 	fasta_dict = []
 	with open(orig_fasta, "r") as fsa:

diff --git a/ncbi_handler.py b/ncbi_handler.py
@@ -127,9 +127,6 @@ def submit_ncbi(database: str, submission_name: str, submission_dir: str, config
 	tools.check_credentials(config_dict=config_dict, database="NCBI")
 	# Submit sequences to NCBI via FTP Server
 	print(f"Uploading sample files to NCBI-{database}, as a '{submission_type}' submission. If this is not intended, interrupt immediately.", file=sys.stdout)
-	if submission_type != "TEST":
-		print("Error: submission_type " + str(submission_type))
-		sys.exit(1)
 	time.sleep(5)
 	try:
 		# Login into NCBI FTP Server

diff --git a/seqsender.py b/seqsender.py
@@ -25,6 +25,8 @@
 
 from settings import VERSION
 
+import sys
+
 # Define current time
 STARTTIME = datetime.now()
 
@@ -60,6 +62,9 @@ def prep(database: List[str], organism: str, submission_dir: str, submission_nam
 			file_dict[file_type] = updated_path # type: ignore
 	# load config file
 	config_dict = tools.get_config(config_file=file_dict["config_file"], database=database)
+	# warn if user is submitting biosample & sra together with Link_Sample_Between_NCBI_Databases off
+	if not config_dict["NCBI"]["Link_Sample_Between_NCBI_Databases"] and 'SRA' in database and 'BIOSAMPLE' in database:
+		print("Warning: You are submitting to BioSample and SRA together, but Link_Sample_Between_NCBI_Databases is off. If you did not intend to do this, SRA submission is likely to fail.\nIf this was unintentional, cancel the submission and set 'Link_Sample_Between_NCBI_Databases: on' in your config yaml.")
 	# load metadata file
 	metadata = tools.get_metadata(database=database, organism=organism, metadata_file=file_dict["metadata_file"], config_dict=config_dict, skip_validation=skip_validation)
 	# Load fasta file into metadata

diff --git a/settings.py b/settings.py
@@ -54,12 +54,14 @@
 # Genbank metadata regex
 GENBANK_REGEX = "^gb-sample_name$"
 
+# Also added sequence_name to these Genbank entries since I needed it for GISAID, but haven't tested the GenBank workflow yet
 # GenBank source file metadata regex
-GENBANK_REGEX_SRC = "^gb-sample_name$|^src-|^bioproject$|^organism$|^collection_date$"
+GENBANK_REGEX_SRC = "^sequence_name$|^gb-sample_name$|^src-|^bioproject$|^organism$|^collection_date$"
 
 # GenBank comment file metadata regex
-GENBANK_REGEX_CMT = "^gb-sample_name$|^cmt-|^organism$|^collection_date$"
+GENBANK_REGEX_CMT = "^sequence_name$|^gb-sample_name$|^cmt-|^organism$|^collection_date$"
 
 ##### GISAID settings #####
 # GISAID metadata regex
-GISAID_REGEX = "^gs-|^collection_date$|^authors"
+# Added ^sequence_name$ here - it seemed to be getting filtered out too early and causing issues
+GISAID_REGEX = "^gs-|^sequence_name$|^collection_date$|^authors$"
diff --git a/tools.py b/tools.py
@@ -14,6 +14,7 @@
 from datetime import datetime, timedelta
 from cerberus import Validator
 import re
+import shutil
 
 from config.seqsender.seqsender_schema import schema as seqsender_schema
 from settings import SCHEMA_EXCLUSIONS, BIOSAMPLE_REGEX, SRA_REGEX, GISAID_REGEX, GENBANK_REGEX, GENBANK_REGEX_CMT, GENBANK_REGEX_SRC
@@ -132,8 +133,21 @@ def parse_hold_date(config_dict: Dict[str, Any]):
 def get_metadata(database: List[str], organism: str, metadata_file: str, config_dict: Dict[str, Any], skip_validation: bool = False) -> pd.DataFrame:
 	# Read in metadata file
 	metadata = file_handler.load_csv(metadata_file)
+	# Adding some default handling for bs-description here - it's not mandatory for biosample submission, but is required in the workflow
+	if "BIOSAMPLE" in database:
+		if 'bs-description' not in metadata.columns:
+		# Create 'bs-description' by joining 'organism', 'bs-host', and 'bs-host_disease' columns
+			metadata['bs-description'] = metadata[['organism', 'bs-host', 'bs-host_disease']].apply(
+			lambda row: ' '.join(row.dropna().astype(str)), axis=1
+		)
+	# If there are empty biosample or sra columns in the metadata sheet, assume they are unused optional columns and drop them before pandera attempts to validate
+	columns_to_drop = [col for col in metadata.columns if (col.startswith('bs-') or col.startswith('sra-')) and metadata[col].replace('', pd.NA).apply(lambda x: x.strip() if isinstance(x, str) else x).isna().all()]
+	if columns_to_drop:
+		metadata = metadata.drop(columns=columns_to_drop)
+		shutil.copy(metadata_file, metadata_file + ".bak")
+		file_handler.save_csv(metadata, metadata_file)
 	# Update seqsender base schema to include needed checks
-	if "BioSample" in database or "SRA" in database:
+	if "BIOSAMPLE" in database or "SRA" in database:
 		seqsender_schema.update_columns({"bioproject":{"checks":Check.str_matches(r"^(?!\s*$).+"),"nullable":False,"required":True}})
 	biosample_schema = sra_schema = genbank_schema = genbank_cmt_schema = genbank_src_schema = gisaid_schema = None
 	# Import schemas