From b184e232a3bc987b31c4da95b415e9f8713d5d6e Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 26 Oct 2023 15:47:19 -0400
Subject: [PATCH 1/3] Implement PMID search using CLI

---
 indra/literature/pubmed_client.py | 34 ++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/indra/literature/pubmed_client.py b/indra/literature/pubmed_client.py
index 7971c766b6..7ceb10f963 100644
--- a/indra/literature/pubmed_client.py
+++ b/indra/literature/pubmed_client.py
@@ -3,7 +3,7 @@
 """
 import logging
 import random
-
+import subprocess
 import requests
 from time import sleep
 from typing import List
@@ -850,3 +850,35 @@ def get_substance_annotations(pubmed_id: str) -> List[str]:
            for c in list(node) for b in c.iter('*')
            if 'UI' in b.attrib]
     return uid
+
+
+def get_all_ids(search_term):
+    """Return all PMIDs for a search term using the edirect CLI.
+
+    This function complements `get_ids`, which uses the PubMed REST API and
+    is limited to 10k results. The edirect CLI implements the paging logic
+    needed to fetch every ID. It only works if edirect is installed and on
+    your PATH; see https://www.ncbi.nlm.nih.gov/books/NBK179288/.
+
+    Parameters
+    ----------
+    search_term : str
+        A term for which the PubMed search should be performed.
+
+    Returns
+    -------
+    list[str]
+        PMIDs found for the search term; empty if the command yields none.
+    """
+    import shlex
+    # Quote the term so that quotes, $ or backticks in a query reach esearch
+    # verbatim instead of being interpreted by the shell
+    cmd = f'esearch -db pubmed -query {shlex.quote(search_term)} ' \
+          '| efetch -format uid'
+    # Read stdout only so that messages on stderr cannot mix into the IDs
+    res = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+    if res.returncode:
+        logger.warning('edirect failed: %s', res.stderr.strip())
+    # Keep numeric lines only: drops . progress markers and blank lines
+    return [ln.strip() for ln in res.stdout.splitlines()
+            if ln.strip().isdigit()]
\ No newline at end of file

From 4cdadfb82892e8a5d979b6dfe8b67519c2fa944b Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 26 Oct 2023 15:48:38 -0400
Subject: [PATCH 2/3] Add note on CLI to get_ids

---
 indra/literature/pubmed_client.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/indra/literature/pubmed_client.py b/indra/literature/pubmed_client.py
index 7ceb10f963..ad0933fd4a 100644
--- a/indra/literature/pubmed_client.py
+++ b/indra/literature/pubmed_client.py
@@ -67,6 +67,10 @@ def get_ids(search_term, **kwargs):
     that can be changed via the corresponding keyword argument. Note also
     the retstart argument along with retmax to page across batches of IDs.
 
+    PubMed's REST API makes it difficult to retrieve more than 10k
+    PMIDs systematically. See the `get_all_ids` function in this module
+    for a way to retrieve more than 10k IDs using the PubMed edirect CLI.
+
     Parameters
     ----------
     search_term : str

From 07fed58ffbc13b457fe4394850b3df840e51db81 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sat, 18 Nov 2023 10:31:05 -0500
Subject: [PATCH 3/3] Add wrapper to get metadata for any number of PMIDs

---
 indra/literature/pubmed_client.py | 51 ++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/indra/literature/pubmed_client.py b/indra/literature/pubmed_client.py
index ad0933fd4a..c5e20ca274 100644
--- a/indra/literature/pubmed_client.py
+++ b/indra/literature/pubmed_client.py
@@ -1,6 +1,8 @@
 """
 Search and get metadata for articles in Pubmed.
 """
+import time
+import tqdm
 import logging
 import random
 import subprocess
@@ -10,7 +12,7 @@
 from functools import lru_cache
 import xml.etree.ElementTree as ET
 from indra.util import UnicodeXMLTreeBuilder as UTB
-from indra.util import pretty_save_xml
+from indra.util import batch_iter, pretty_save_xml
 
 
 logger = logging.getLogger(__name__)
@@ -779,6 +781,53 @@ def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False,
                                       references_included=references_included)
 
 
+def get_metadata_for_all_ids(pmid_list, get_issns_from_nlm=False,
+                             get_abstracts=False, prepend_title=False,
+                             detailed_authors=False, references_included=None):
+    """Get article metadata for any number of PMIDs, in batches of 200.
+
+    Parameters
+    ----------
+    pmid_list : list of str
+        Can contain any number of PMIDs.
+    get_issns_from_nlm : bool
+        Look up the full list of ISSN number for the journal associated with
+        the article, which helps to match articles to CrossRef search results.
+        Defaults to False, since it slows down performance.
+    get_abstracts : bool
+        Indicates whether to include the Pubmed abstract in the results.
+    prepend_title : bool
+        If get_abstracts is True, specifies whether the article title should
+        be prepended to the abstract text.
+    detailed_authors : bool
+        If True, extract as many of the author details as possible, such as
+        first name, identifiers, and institutions. If false, only last names
+        are returned. Default: False
+    references_included : Optional[str]
+        If 'detailed', include detailed references in the results. If 'pmid',
+        only include the PMID of the reference. If None, don't include
+        references. Default: None
+
+    Returns
+    -------
+    dict of dicts
+        The dicts returned by `get_metadata_for_ids` for each batch, merged
+        into one dictionary. Batches that return None are skipped.
+    """
+    shared_kwargs = dict(get_issns_from_nlm=get_issns_from_nlm,
+                         get_abstracts=get_abstracts,
+                         prepend_title=prepend_title,
+                         detailed_authors=detailed_authors,
+                         references_included=references_included)
+    merged = {}
+    for batch in tqdm.tqdm(batch_iter(pmid_list, 200),
+                           desc='Retrieving metadata'):
+        time.sleep(0.1)  # brief pause before each batch call
+        batch_metadata = get_metadata_for_ids(list(batch), **shared_kwargs)
+        if batch_metadata is not None:
+            merged.update(batch_metadata)
+    return merged
+
+
 @lru_cache(maxsize=1000)
 def get_issns_for_journal(nlm_id):
     """Get a dict of the ISSN numbers for a journal given its NLM ID.