def get_all_ids(search_term):
    """Return all PMIDs for a search term using the edirect CLI.

    This function complements the `get_ids` function which uses the PubMed
    REST API but is limited to 10k results and is very difficult to
    generalize to systematically fetch all IDs if there are more than 10k
    results. This function uses the edirect CLI which implements logic
    for paging over results.

    This function only works if edirect is installed and is on your PATH.
    See https://www.ncbi.nlm.nih.gov/books/NBK179288/ for instructions.

    Parameters
    ----------
    search_term : str
        A term for which the PubMed search should be performed. The term is
        handed to `esearch` as a single argument without going through a
        shell, so quotes and other shell metacharacters in it are safe.

    Returns
    -------
    list[str]
        A list of PMIDs for the given search term. The list is empty if
        there are no results or if the edirect CLI could not be run (in
        which case an error is logged).
    """
    log = logging.getLogger(__name__)
    try:
        # Equivalent of `esearch ... | efetch -format uid`, but run from
        # argument lists so the search term is never parsed by a shell.
        search = subprocess.run(
            ['esearch', '-db', 'pubmed', '-query', str(search_term)],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            universal_newlines=True)
        fetch = subprocess.run(
            ['efetch', '-format', 'uid'], input=search.stdout,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            universal_newlines=True)
    except OSError as err:
        # Typically FileNotFoundError when edirect is not installed
        log.error('Could not run the edirect CLI, make sure it is installed '
                  'and on your PATH: %s', err)
        return []

    for tool, proc in (('esearch', search), ('efetch', fetch)):
        if proc.returncode != 0:
            log.warning('%s exited with status %d: %s', tool,
                        proc.returncode, proc.stderr.strip())

    # Output has one UID per line. Only all-digit lines are kept, which
    # drops blank lines as well as the "." progress markers the CLI emits
    # when paging over more than 10k IDs.
    pmids = []
    for line in fetch.stdout.splitlines():
        uid = line.strip()
        if uid.isdigit():
            pmids.append(uid)
    return pmids
def get_metadata_for_all_ids(pmid_list, get_issns_from_nlm=False,
                             get_abstracts=False, prepend_title=False,
                             detailed_authors=False, references_included=None):
    """Get article metadata for any number of PMIDs from the Pubmed database.

    The PMIDs are split into batches of 200, each batch is passed to
    `get_metadata_for_ids`, and the per-batch results are merged into a
    single dictionary. Batches for which `get_metadata_for_ids` returns
    None are skipped.

    Parameters
    ----------
    pmid_list : list of str
        Can contain any number of PMIDs.
    get_issns_from_nlm : bool
        Look up the full list of ISSN number for the journal associated with
        the article, which helps to match articles to CrossRef search results.
        Defaults to False, since it slows down performance.
    get_abstracts : bool
        Indicates whether to include the Pubmed abstract in the results.
    prepend_title : bool
        If get_abstracts is True, specifies whether the article title should
        be prepended to the abstract text.
    detailed_authors : bool
        If True, extract as many of the author details as possible, such as
        first name, identifiers, and institutions. If false, only last names
        are returned. Default: False
    references_included : Optional[str]
        If 'detailed', include detailed references in the results. If 'pmid',
        only include the PMID of the reference. If None, don't include
        references. Default: None

    Returns
    -------
    dict of dicts
        Dictionary indexed by PMID. Each value is a dict containing the
        following fields: 'doi', 'title', 'authors', 'journal_title',
        'journal_abbrev', 'journal_nlm_id', 'issn_list', 'page'.
    """
    batch_size = 200
    # Give tqdm the number of batches when it can be known up front so the
    # progress bar shows completion; inputs without a length still work.
    try:
        num_batches = -(-len(pmid_list) // batch_size)
    except TypeError:
        num_batches = None

    all_metadata = {}
    for ids in tqdm.tqdm(batch_iter(pmid_list, batch_size),
                         total=num_batches, desc='Retrieving metadata'):
        # Short pause before each request; presumably to stay within
        # PubMed's request rate limits.
        time.sleep(0.1)
        metadata = get_metadata_for_ids(list(ids),
                                        get_issns_from_nlm=get_issns_from_nlm,
                                        get_abstracts=get_abstracts,
                                        prepend_title=prepend_title,
                                        detailed_authors=detailed_authors,
                                        references_included=references_included)
        if metadata is not None:
            all_metadata.update(metadata)
    return all_metadata