Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add UMLS IDs to taxon #60

Merged
merged 10 commits into from
Oct 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"download_directory": "babel_downloads",
"intermediate_directory": "babel_outputs/intermediate",
"output_directory": "babel_outputs",
"biolink_version": "2.2.7",
"biolink_version": "3.0.3",

"ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
"ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],
Expand Down Expand Up @@ -44,10 +44,10 @@
"chemical_ids": ["CHEMBL.COMPOUND","GTOPDB","KEGG.COMPOUND","CHEBI","UNII","HMDB","PUBCHEM.COMPOUND","DrugCentral","DRUGBANK","MESH","UMLS"],
"chemical_outputs": ["MolecularMixture.txt", "SmallMolecule.txt", "Polypeptide.txt", "ComplexMolecularMixture.txt", "ChemicalEntity.txt", "ChemicalMixture.txt"],

"taxon_labels": ["NCBITaxon","MESH"],
"taxon_synonyms": ["NCBITaxon"],
"taxon_ids": ["NCBITaxon","MESH"],
"taxon_concords": ["NCBI_MESH"],
"taxon_labels": ["NCBITaxon","MESH","UMLS"],
"taxon_synonyms": ["NCBITaxon","UMLS"],
"taxon_ids": ["NCBITaxon","MESH","UMLS"],
"taxon_concords": ["NCBI_MESH", "UMLS"],
"taxon_outputs": ["OrganismTaxon.txt"],

"genefamily_labels": ["PANTHER.FAMILY","HGNC.FAMILY"],
Expand Down
15 changes: 12 additions & 3 deletions src/babel_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,17 +199,26 @@ def pull_via_urllib(url: str, in_file_name: str, decompress = True, subpath=None
# return the filename to the caller
return out_file_name

def write_compendium(synonym_list,ofname,node_type,labels={}):
def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[]):
"""
:param synonym_list:
:param ofname:
:param node_type:
:param labels:
:param extra_prefixes: We default to only allowing the prefixes allowed for a particular type in Biolink.
If you want to allow additional prefixes, list them here.
:return:
"""
config = get_config()
cdir = config['output_directory']
biolink_version = config['biolink_version']
node_factory = NodeFactory(make_local_name(''),biolink_version)
synonym_factory = SynonymFactory(make_local_name(''))
ic_factory = InformationContentFactory(f'{get_config()["input_directory"]}/icRDF.tsv')
node_test = node_factory.create_node(input_identifiers=[],node_type=node_type,labels={})
node_test = node_factory.create_node(input_identifiers=[],node_type=node_type,labels={},extra_prefixes = extra_prefixes)
with jsonlines.open(os.path.join(cdir,'compendia',ofname),'w') as outf, open(os.path.join(cdir,'synonyms',ofname),'w') as sfile:
for slist in synonym_list:
node = node_factory.create_node(input_identifiers=slist, node_type=node_type,labels = labels)
node = node_factory.create_node(input_identifiers=slist, node_type=node_type,labels = labels, extra_prefixes = extra_prefixes)
if node is not None:
nw = {"type": node['type']}
ic = ic_factory.get_ic(node)
Expand Down
47 changes: 45 additions & 2 deletions src/createcompendia/taxon.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from src.prefixes import NCBITAXON,MESH
from src.prefixes import NCBITAXON,MESH,UMLS
from src.categories import ORGANISM_TAXON

import src.datahandlers.mesh as mesh
import src.datahandlers.umls as umls

from src.babel_utils import read_identifier_file,glom,write_compendium
import src.eutil as eutil
Expand All @@ -21,6 +22,48 @@ def write_mesh_ids(outfile):
#Also add anything from SCR_Organism, if it doesn't have a tree map
mesh.write_ids(meshmap,outfile,order=[ORGANISM_TAXON],extra_vocab={'SCR_Organism':ORGANISM_TAXON})

def write_umls_ids(outfile):
    """Write the UMLS identifiers that should be classified as taxa.

    Each UMLS category listed below is mapped to ORGANISM_TAXON, and the
    resulting mapping is handed to umls.write_umls_ids() along with
    `outfile`.

    :param outfile: passed through unchanged to umls.write_umls_ids().
    """
    # UMLS categories treated as taxa. The T-number next to each entry
    # identifies its page in the UMLS semantic network browser:
    #   https://uts.nlm.nih.gov/uts/umls/semantic-network/<T-number>
    #
    # Not clear if this one should be included, so it is left out for now:
    #   A1.1.3.1.1.4.1: Human (T016)
    #   (presumably the human taxon is represented as _Homo sapiens_,
    #   which is http://id.nlm.nih.gov/mesh/D006801)
    taxon_categories = (
        'A1.1.3',        # Eukaryote (T204)
        'A1.1.2',        # Bacterium (T007)
        'A1.1.3.3',      # Plant (T002)
        'A1.1.3.2',      # Fungus (T004)
        'A1.1.3.1.1.3',  # Fish (T013)
        'A1.1.3.1.1.2',  # Bird (T012)
        'A1.1.4',        # Virus (T005)
        'A1.1.3.1.1.4',  # Mammal (T015)
        'A1.1.3.1.1.5',  # Reptile (T014)
        'A1.1.3.1.1.1',  # Amphibian (T011)
        'A1.1.1',        # Archaeon (T194)
        'A1.1.3.1',      # Animal (T008)
        'A1.1',          # Organism (T001)
        'A1.1.3.1.1',    # Vertebrate (T010)
    )
    # Every category gets the same Biolink type; key order matches the tuple.
    umlsmap = dict.fromkeys(taxon_categories, ORGANISM_TAXON)
    umls.write_umls_ids(umlsmap, outfile)

def build_taxon_umls_relationships(idfile, outfile):
    """Build the UMLS relationships for taxa by delegating to umls.build_sets().

    :param idfile: passed through unchanged as the first argument of
        umls.build_sets().
    :param outfile: passed through unchanged as the second argument of
        umls.build_sets().
    """
    # Keys are presumably UMLS source vocabulary names (TODO confirm against
    # umls.build_sets); values are the prefixes this module uses for them.
    source_to_prefix = {'MSH': MESH, 'NCBITaxon': NCBITAXON}
    umls.build_sets(idfile, outfile, source_to_prefix)

def build_relationships(outfile,mesh_ids):
regis = mesh.pull_mesh_registry()
with open(mesh_ids,'r') as inf:
Expand All @@ -44,7 +87,7 @@ def build_compendia(concordances, identifiers):
:identifiers: a list of files from which to read identifiers and optional categories"""
dicts = {}
types = {}
uniques = [NCBITAXON,MESH]
uniques = [NCBITAXON,MESH,UMLS]
for ifile in identifiers:
print('loading',ifile)
new_identifiers, new_types = read_identifier_file(ifile)
Expand Down
8 changes: 4 additions & 4 deletions src/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def get_ic(self, node):

class NodeFactory:
def __init__(self,label_dir,biolink_version):
self.toolkit = Toolkit(f'https://raw.githubusercontent.com/biolink/biolink-model/{biolink_version}/biolink-model.yaml')
self.toolkit = Toolkit(f'https://raw.githubusercontent.com/biolink/biolink-model/v{biolink_version}/biolink-model.yaml')
self.ancestor_map = {}
self.prefix_map = {}
self.ignored_prefixes = set()
Expand Down Expand Up @@ -172,12 +172,12 @@ def apply_labels(self, input_identifiers, labels):
labeled_list.append(iid)
return labeled_list

def create_node(self,input_identifiers,node_type,labels={}):
def create_node(self,input_identifiers,node_type,labels={},extra_prefixes=[]):
#This is where we will normalize, i.e. choose the best id, and add types in accord with BL.
#we should also include provenance and version information for the node set build.
#ancestors = self.get_ancestors(node_type)
#ancestors.reverse()
prefixes = self.get_prefixes(node_type)
prefixes = self.get_prefixes(node_type) + extra_prefixes
if len(input_identifiers) == 0:
return None
if len(input_identifiers) > 1000:
Expand Down Expand Up @@ -298,4 +298,4 @@ def pubchemsort(pc_ids, labeled_ids):
if pcid == best_pubchem_id:
best_pubchem = pcelement
pc_ids.remove(best_pubchem)
return [best_pubchem] + pc_ids
return [best_pubchem] + pc_ids
14 changes: 14 additions & 0 deletions src/snakefiles/taxon.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,20 @@ rule taxon_mesh_ids:
run:
taxon.write_mesh_ids(output.outfile)

# Produce <intermediate_directory>/taxon/ids/UMLS by calling
# taxon.write_umls_ids() on the declared output file. The rule has no inputs.
rule taxon_umls_ids:
    output:
        outfile=config['intermediate_directory']+"/taxon/ids/UMLS"
    run:
        taxon.write_umls_ids(output.outfile)

# Produce <intermediate_directory>/taxon/concords/UMLS from the
# <intermediate_directory>/taxon/ids/UMLS file by calling
# taxon.build_taxon_umls_relationships(infile, outfile).
rule get_taxon_umls_relationships:
    input:
        infile=config['intermediate_directory']+"/taxon/ids/UMLS"
    output:
        outfile=config['intermediate_directory']+'/taxon/concords/UMLS',
    run:
        taxon.build_taxon_umls_relationships(input.infile,output.outfile)

rule get_taxon_relationships:
input:
meshfile=config['download_directory']+"/MESH/mesh.nt",
Expand Down