From 5061a4291d2d0da95de4e4f6ef6b8113f7770f6c Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 14 Nov 2023 11:58:05 +0100 Subject: [PATCH] Add discovery algorithm (#93) This PR adds a workflow that iterates through a list of URIs and tries to discover new URI prefixes --- docs/source/discovery.rst | 256 +++++++++++++++++++++++++++++++++++++ docs/source/index.rst | 1 + src/curies/__init__.py | 4 + src/curies/discovery.py | 257 ++++++++++++++++++++++++++++++++++++++ tests/test_discovery.py | 82 ++++++++++++ 5 files changed, 600 insertions(+) create mode 100644 docs/source/discovery.rst create mode 100644 src/curies/discovery.py create mode 100644 tests/test_discovery.py diff --git a/docs/source/discovery.rst b/docs/source/discovery.rst new file mode 100644 index 0000000..b85d6c9 --- /dev/null +++ b/docs/source/discovery.rst @@ -0,0 +1,256 @@ +URI Prefix Discovery +==================== +.. automodule:: curies.discovery + +Discovering URI Prefixes from an Ontology +----------------------------------------- +A common place where discovering URI prefixes is important is when working with new ontologies. +In the following example, we look at the `Academic Event Ontology (AEON) `_. This +is an ontology developed under OBO Foundry principles describing academic events. Accordingly, it includes many +URI references to terms in OBO Foundry ontologies. + +In this tutorial, we use :func:`curies.discover` (and then :func:`curies.discover_from_rdf` as a nice convenience +function) to load the ontology in the RDF/XML format and discover putative URI prefixes. + +.. 
code-block:: python + + import curies + from curies.discovery import get_uris_from_rdf + + ONTOLOGY_URL = "https://raw.githubusercontent.com/tibonto/aeon/main/aeon.owl" + + uris = get_uris_from_rdf(ONTOLOGY_URL, format="xml") + discovered_converter = curies.discover(uris) + # note, these two steps can be combined with curies.discover_from_rdf, + # and we'll do that in the following examples + +We discovered fifty URI prefixes in the following table. Many of them appear to be OBO Foundry URI prefixes or +semantic web prefixes, so in the next step, we'll use prior knowledge to reduce the false discovery rate. + +============== ============================================================================== +curie_prefix uri_prefix +============== ============================================================================== +ns1 ``http://ontologydesignpatterns.org/wiki/Community:Parts_and_`` +ns2 ``http://protege.stanford.edu/plugins/owl/protege#`` +ns3 ``http://purl.obolibrary.org/obo/AEON_`` +ns4 ``http://purl.obolibrary.org/obo/APOLLO_SV_`` +ns5 ``http://purl.obolibrary.org/obo/BFO_`` +ns6 ``http://purl.obolibrary.org/obo/CRO_`` +ns7 ``http://purl.obolibrary.org/obo/ENVO_`` +ns8 ``http://purl.obolibrary.org/obo/IAO_`` +ns9 ``http://purl.obolibrary.org/obo/ICO_`` +ns10 ``http://purl.obolibrary.org/obo/NCBITaxon_`` +ns11 ``http://purl.obolibrary.org/obo/OBIB_`` +ns12 ``http://purl.obolibrary.org/obo/OBI_`` +ns13 ``http://purl.obolibrary.org/obo/OMO_`` +ns14 ``http://purl.obolibrary.org/obo/OOSTT_`` +ns15 ``http://purl.obolibrary.org/obo/RO_`` +ns16 ``http://purl.obolibrary.org/obo/TXPO_`` +ns17 ``http://purl.obolibrary.org/obo/bfo/axiom/`` +ns18 ``http://purl.obolibrary.org/obo/valid_for_`` +ns19 ``http://purl.obolibrary.org/obo/valid_for_go_`` +ns20 ``http://purl.obolibrary.org/obo/valid_for_go_annotation_`` +ns21 ``http://purl.obolibrary.org/obo/wikiCFP_`` +ns22 ``http://purl.org/dc/elements/1.1/`` +ns23 ``http://purl.org/dc/terms/`` +ns24 
``http://usefulinc.com/ns/doap#`` +ns25 ``http://wiki.geneontology.org/index.php/Involved_`` +ns26 ``http://www.geneontology.org/formats/oboInOwl#`` +ns27 ``http://www.geneontology.org/formats/oboInOwl#created_`` +ns28 ``http://www.w3.org/1999/02/22-rdf-syntax-ns#`` +ns29 ``http://www.w3.org/2000/01/rdf-schema#`` +ns30 ``http://www.w3.org/2001/XMLSchema#`` +ns31 ``http://www.w3.org/2002/07/owl#`` +ns32 ``http://www.w3.org/2003/11/swrl#`` +ns33 ``http://www.w3.org/2004/02/skos/core#`` +ns34 ``http://www.w3.org/ns/prov#`` +ns35 ``http://xmlns.com/foaf/0.1/`` +ns36 ``https://en.wikipedia.org/wiki/Allen%27s_interval_`` +ns37 ``https://groups.google.com/d/msg/bfo-owl-devel/s9Uug5QmAws/ZDRnpiIi_`` +ns38 ``https://ror.org/`` +ns39 ``https://w3id.org/scholarlydata/ontology/conference-ontology.owl#`` +ns40 ``https://w3id.org/seo#`` +ns41 ``https://www.confident-conference.org/index.php/Academic_Field:Information_`` +ns42 ``https://www.confident-conference.org/index.php/Event:VIVO_`` +ns43 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_`` +ns44 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_orga_`` +ns45 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk1_`` +ns46 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk2_`` +ns47 ``https://www.confident-conference.org/index.php/Event_Series:VIVO_`` +ns48 ``https://www.wikidata.org/wiki/`` +ns49 ``https://www.wikidata.org/wiki/Wikidata:Property_proposal/colocated_`` +ns50 ``urn:swrl#`` +============== ============================================================================== + +In the following block, we chain together (extended) prefix maps from the OBO Foundry as well as +a "semantic web" prefix map to try and reduce the number of false positives by passing them +through the ``converter`` keyword argument. + +.. 
code-block:: python + + import curies + + ONTOLOGY_URL = "https://raw.githubusercontent.com/tibonto/aeon/main/aeon.owl" + SEMWEB_URL = "https://raw.githubusercontent.com/biopragmatics/bioregistry/main/exports/contexts/semweb.context.jsonld" + + base_converter = curies.chain([ + curies.load_jsonld_context(SEMWEB_URL), + curies.get_obo_converter(), + ]) + + discovered_converter = curies.discover_from_rdf( + ONTOLOGY_URL, format="xml", converter=base_converter + ) + +We reduced the number of putative URI prefixes by half in the following table. However, we can still identify +some putative URI prefixes that likely would have appeared in a more comprehensive (extended) prefix map such +as the Bioregistry, including: + +- ``https://ror.org/`` for the `Research Organization Registry (ROR) `_ +- ``https://w3id.org/seo#`` for the `Scientific Event Ontology (SEO) `_ +- ``http://usefulinc.com/ns/doap#`` for the `Description of a Project (DOAP) vocabulary `_ + +Despite this, we're on our way! It's also obvious that several of the remaining putative URI prefixes come from +non-standard usage of the OBO PURL system (e.g., ``http://purl.obolibrary.org/obo/valid_for_go_annotation_``) +and some are proper false positives due to using ``_`` as a delimiter +(e.g., ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk2_``). 
+ +============== ============================================================================== +curie_prefix uri_prefix +============== ============================================================================== +ns1 ``http://ontologydesignpatterns.org/wiki/Community:Parts_and_`` +ns2 ``http://protege.stanford.edu/plugins/owl/protege#`` +ns3 ``http://purl.obolibrary.org/obo/AEON_`` +ns4 ``http://purl.obolibrary.org/obo/bfo/axiom/`` +ns5 ``http://purl.obolibrary.org/obo/valid_for_`` +ns6 ``http://purl.obolibrary.org/obo/valid_for_go_`` +ns7 ``http://purl.obolibrary.org/obo/valid_for_go_annotation_`` +ns8 ``http://purl.obolibrary.org/obo/wikiCFP_`` +ns9 ``http://usefulinc.com/ns/doap#`` +ns10 ``http://wiki.geneontology.org/index.php/Involved_`` +ns11 ``https://en.wikipedia.org/wiki/Allen%27s_interval_`` +ns12 ``https://groups.google.com/d/msg/bfo-owl-devel/s9Uug5QmAws/ZDRnpiIi_`` +ns13 ``https://ror.org/`` +ns14 ``https://w3id.org/scholarlydata/ontology/conference-ontology.owl#`` +ns15 ``https://w3id.org/seo#`` +ns16 ``https://www.confident-conference.org/index.php/Academic_Field:Information_`` +ns17 ``https://www.confident-conference.org/index.php/Event:VIVO_`` +ns18 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_`` +ns19 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_orga_`` +ns20 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk1_`` +ns21 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk2_`` +ns22 ``https://www.confident-conference.org/index.php/Event_Series:VIVO_`` +ns23 ``https://www.wikidata.org/wiki/`` +ns24 ``https://www.wikidata.org/wiki/Wikidata:Property_proposal/colocated_`` +ns25 ``urn:swrl#`` +============== ============================================================================== + +As a final step in our iterative journey of URI prefix discovery, we're going to use a cutoff for a minimum +of two appearances of a URI prefix to reduce the most spurious false positives. 
+ +.. code-block:: python + + import curies + + ONTOLOGY_URL = "https://raw.githubusercontent.com/tibonto/aeon/main/aeon.owl" + SEMWEB_URL = "https://raw.githubusercontent.com/biopragmatics/bioregistry/main/exports/contexts/semweb.context.jsonld" + + base_converter = curies.chain([ + curies.load_jsonld_context(SEMWEB_URL), + curies.get_obo_converter(), + ]) + + discovered_converter = curies.discover_from_rdf( + ONTOLOGY_URL, format="xml", converter=base_converter, cutoff=2 + ) + +We have reduced the list to a manageable set of 9 putative URI prefixes in the following table. + +============== ========================================================================= +curie_prefix uri_prefix +============== ========================================================================= +ns1 ``http://purl.obolibrary.org/obo/AEON_`` +ns2 ``http://purl.obolibrary.org/obo/bfo/axiom/`` +ns3 ``http://purl.obolibrary.org/obo/valid_for_go_`` +ns4 ``https://w3id.org/scholarlydata/ontology/conference-ontology.owl#`` +ns5 ``https://w3id.org/seo#`` +ns6 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_`` +ns7 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk1_`` +ns8 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk2_`` +ns9 ``urn:swrl#`` +============== ========================================================================= + +Here are the calls to be made: + +- ``ns1`` represents the AEON vocabulary itself and should be given the ``aeon`` prefix. +- ``ns2`` and ``ns3`` are both false positives +- ``ns6``, ``ns7``, and ``ns8`` are a tricky case - they have a meaningful overlap that can't be easily + automatically detected (yet). In this case, it makes the most sense to add the shortest one manually + to the base converter with some unique name (don't use ``ns6`` as it will cause conflicts later), like in: + + .. 
code-block:: python + + base_converter = curies.chain([ + curies.load_jsonld_context(SEMWEB_URL), + curies.get_obo_converter(), + curies.load_prefix_map({"confident_event_vivo_2021": "https://www.confident-conference.org/index.php/Event:VIVO_2021_"}), + ]) + + In reality, these are all part of the `ConfIDent Event `_ vocabulary, + which has the URI prefix ``https://www.confident-conference.org/index.php/Event:``. +- ``ns4`` represents the `Conference Ontology `_ and + should be given the ``conference`` prefix. +- ``ns5`` represents the `Scientific Event Ontology (SEO) `_ and + should be given the ``seo`` prefix. +- ``ns9`` represents the `Semantic Web Rule Language `_, + though using URNs is an interesting choice in serialization. + +After we've made these calls, it's a good idea to write an (extended) prefix map. In this case, since we aren't working +with CURIE prefix synonyms nor URI prefix synonyms, it's okay to write a simple prefix map or a JSON-LD context without +losing information. + +.. note:: + + Postscript: throughout this guide, we used the following Python code to create the RST tables: + + .. code-block:: python + + def print_converter(converter) -> None: + from tabulate import tabulate + rows = sorted( + [ + (record.prefix, f"``{record.uri_prefix}``") + for record in converter.records + ], + key=lambda t: int(t[0].removeprefix("ns")), + ) + print(tabulate(rows, headers=["curie_prefix", "uri_prefix"], tablefmt="rst")) + +Just Make It Work, or, A Guide to Being a Questionable Semantic Citizen +----------------------------------------------------------------------- +The goal of the :mod:`curies` package is to provide the tools towards making semantically well-defined data, +which has a meaningful (extended) prefix map associated with it. Maybe you're in an organization that doesn't really +care about the utility of nice prefix maps, and just wants to get the job done where you need to turn URIs into *some* +CURIEs. 
+ +Here's a recipe for doing this, based on the last example with AEON: + +.. code-block:: python + + import curies + + ONTOLOGY_URL = "https://raw.githubusercontent.com/tibonto/aeon/main/aeon.owl" + + # Use the Bioregistry as a base converter since it's the most comprehensive one + base_converter = curies.get_bioregistry_converter() + + # Only discover what the Bioregistry doesn't already have + discovered_converter = curies.discover_from_rdf( + ONTOLOGY_URL, format="xml", converter=base_converter + ) + + # Chain together the base converter with the discoveries + augmented_converter = curies.chain([base_converter, discovered_converter]) + +With the augmented converter, you can now convert all URIs in the ontology into CURIEs. They will have a smattering +of unintelligible prefixes with no meaning, but at least the job is done! diff --git a/docs/source/index.rst b/docs/source/index.rst index 76082d7..8bbe6ba 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -63,6 +63,7 @@ for updating your code. tutorial reconciliation + discovery struct api services/index diff --git a/src/curies/__init__.py b/src/curies/__init__.py index 4ecb149..350a221 100644 --- a/src/curies/__init__.py +++ b/src/curies/__init__.py @@ -19,6 +19,7 @@ write_jsonld_context, write_shacl, ) +from .discovery import discover, discover_from_rdf from .reconciliation import remap_curie_prefixes, remap_uri_prefixes, rewire from .sources import ( get_bioregistry_converter, @@ -56,4 +57,7 @@ "get_monarch_converter", "get_go_converter", "get_bioregistry_converter", + # discovery + "discover", + "discover_from_rdf", ] diff --git a/src/curies/discovery.py b/src/curies/discovery.py new file mode 100644 index 0000000..cf8919e --- /dev/null +++ b/src/curies/discovery.py @@ -0,0 +1,257 @@ +"""Discover new entries for a Converter. + +The :func:`curies.discover` functionality is intended to be used in a "data science" workflow. 
Its goal is to +enable a data scientist to semi-interactively explore data (e.g., coming from an ontology, SSSOM, RDF) +that doesn't come with a complete (extended) prefix map and identify common URI prefixes. + +It returns the discovered URI prefixes in a :class:`curies.Converter` object with "dummy" CURIE prefixes. +This makes it possible to convert the URIs appearing in the data into CURIEs and therefore enables +their usage in places where CURIEs are expected. + +However, it's suggested that after discovering URI prefixes, the data scientist more carefully +constructs a meaningful prefix map based on the discovered one. This might include some or all of the +following steps: + +1. Replace dummy CURIE prefixes with meaningful ones +2. Remove spurious URI prefixes that appear but do not represent a semantic space. This happens often + due to using ``_`` as a delimiter or having a frequency cutoff of zero (see the parameters for this function). +3. Consider chaining a comprehensive extended prefix map such as the Bioregistry + (from :func:`curies.get_bioregistry_converter`) onto the converter passed to this function + so pre-existing URI prefixes are not *re-discovered*. + +Finally, you should save the prefix map that you create in a persistent place (e.g., inside a JSON file) such +that it can be reused. + +Algorithm +--------- +The :func:`curies.discover` function implements an algorithm that does the following for each URI: + +1. For each delimiter (in the priority order they are given) check if the delimiter is present. +2. If it's present, split the URI into two parts based on rightmost appearance of the delimiter. +3. If the right part after splitting is all alphanumeric characters, save the URI prefix (with delimiter attached) +4. If a delimiter is successfully used to identify a URI prefix, don't check any of the following delimiters + +After identifying putative URI prefixes, the second part of the algorithm does the following: + +1. 
If a cutoff was provided, remove all putative URI prefixes for which there were fewer examples than the cutoff +2. Sort the URI prefixes lexicographically (i.e., with :func:`sorted`) +3. Assign a dummy CURIE prefix to each URI prefix, counting upwards from 1 +4. Construct a converter from this prefix map and return it +""" + +from collections import defaultdict +from pathlib import PurePath +from typing import IO, TYPE_CHECKING, Any, Iterable, Mapping, Optional, Sequence, Set, TextIO, Union + +from typing_extensions import Literal + +from curies import Converter, Record + +if TYPE_CHECKING: + import rdflib + +__all__ = [ + "discover", + "discover_from_rdf", +] + + +GraphFormats = Literal["turtle", "xml", "n3", "nt", "trix"] +GraphInput = Union[IO[bytes], TextIO, "rdflib.parser.InputSource", str, bytes, PurePath] + + +def discover_from_rdf( + graph: Union[GraphInput, "rdflib.Graph"], + *, + format: Optional[GraphFormats] = None, + **kwargs: Any, +) -> Converter: + """Discover new URI prefixes from RDF content via :mod:`rdflib`. + + This function works the same as :func:`discover`, but gets its URI list from + a triple store. See :func:`discover` for a more detailed explanation of how this + algorithm works. + + :param graph: + Either a pre-instantiated RDFlib graph, or an input type to the ``source`` + keyword of :meth:`rdflib.Graph.parse`. This can be one of the following: + + - A string or bytes representation of a URL + - A string, bytes, or Path representation of a local file + - An I/O object that can be read directly + - An open XML reader from RDFlib (:class:`rdflib.parser.InputSource`) + :param format: If ``graph`` is given as a URL or I/O object, this + is passed through to the ``format`` keyword of :meth:`rdflib.Graph.parse`. + If none is given, defaults to ``turtle``. + :param kwargs: Keyword arguments passed through to :func:`discover` + :returns: + A converter with dummy prefixes for URI prefixes appearing in the RDF + content (i.e., triples). 
+ """ + uris = get_uris_from_rdf(graph=graph, format=format) + return discover(uris, **kwargs) + + +def get_uris_from_rdf( + graph: Union[GraphInput, "rdflib.Graph"], *, format: Optional[GraphFormats] = None +) -> Set[str]: + """Get a set of URIs from a graph.""" + graph = _ensure_graph(graph=graph, format=format) + return set(_yield_uris(graph=graph)) + + +def _ensure_graph( + *, graph: Union[GraphInput, "rdflib.Graph"], format: Optional[GraphFormats] = None +) -> "rdflib.Graph": + import rdflib + + if not isinstance(graph, rdflib.Graph): + _temp_graph = rdflib.Graph() + _temp_graph.parse(source=graph, format=format) + graph = _temp_graph + + return graph + + +def _yield_uris(*, graph: "rdflib.Graph") -> Iterable[str]: + import rdflib + + for parts in graph.triples((None, None, None)): + for part in parts: + if isinstance(part, rdflib.URIRef): + yield str(part) + + +def discover( + uris: Iterable[str], + *, + delimiters: Optional[Sequence[str]] = None, + cutoff: Optional[int] = None, + metaprefix: str = "ns", + converter: Optional[Converter] = None, +) -> Converter: + """Discover new URI prefixes and construct a converter with a unique dummy CURIE prefix for each. + + :param uris: + An iterable of URIs to search through. Will be taken as a set and + each unique entry is only considered once. + :param delimiters: + The character(s) that delimit a URI prefix from a local unique identifier. + If none given, defaults to using ``/``, ``#``, and ``_``. For example: + + - ``/`` is the delimiter in ``https://www.ncbi.nlm.nih.gov/pubmed/37929212``, which separates + the URI prefix ``https://www.ncbi.nlm.nih.gov/pubmed/`` from the local unique identifier + `37929212 `_ for the article + "New insights into osmobiosis and chemobiosis in tardigrades" in PubMed. 
- ``#`` is the delimiter in ``http://www.w3.org/2000/01/rdf-schema#label``, which separates + the URI prefix ``http://www.w3.org/2000/01/rdf-schema#`` from the local unique identifier + `label `_ for the term "label" in the RDF Schema. + The ``#`` is typically used in a URL to denote a fragment and commonly appears in small semantic + web vocabularies that are shown as a single HTML page. + - ``_`` is the delimiter in ``http://purl.obolibrary.org/obo/GO_0032571``, which separates + the URI prefix ``http://purl.obolibrary.org/obo/GO_`` from the local unique identifier + `0032571 `_ for the term "response to vitamin K" + in the Gene Ontology + + .. note:: The delimiter is itself a part of the URI prefix + :param cutoff: + If given, will require at least ``cutoff`` unique local unique identifiers + associated with a given URI prefix to keep it. + + If not given, no cutoff is applied, which increases recall (i.e., likelihood of getting all + possible URI prefixes) but decreases precision (i.e., more of the results + might be false positives / spurious). If you get a lot of false positives, + try increasing first to 1, 2, then maybe higher. + :param metaprefix: + The beginning part of each dummy prefix, followed by a number. The default value + is ``ns``, so dummy prefixes are named ``ns1``, ``ns2``, and so on. + :param converter: + If a pre-existing converter is passed, then URIs that can be parsed using the + pre-existing converter are not considered during discovery. + + For example, if you're an OBO person working with URIs coming from an OBO ontology, + it makes sense to pass the converter from :func:`curies.get_obo_converter` to + reduce false positive discoveries. 
More generally, a comprehensive converter + like the Bioregistry (from :func:`curies.get_bioregistry_converter`) can massively + reduce false positive discoveries and ultimately reduce burden on the data scientist + using this function when needing to understand the results and carefully curate a prefix + map based on the discoveries. + :returns: + A converter with dummy prefixes + + .. code-block:: python + + >>> import curies + + # Generate some example URIs + >>> uris = [f"http://ran.dom/{i:03}" for i in range(30)] + + >>> discovered_converter = curies.discover(uris) + >>> discovered_converter.records + [Record(prefix="ns1", uri_prefix="http://ran.dom/")] + + # Now, you can compress the URIs to dummy CURIEs + >>> discovered_converter.compress("http://ran.dom/002") + 'ns1:002' + """ + uri_prefix_to_luids = _get_uri_prefix_to_luids( + converter=converter, uris=uris, delimiters=delimiters + ) + uri_prefixes = [ + uri_prefix + for uri_prefix, luids in sorted(uri_prefix_to_luids.items()) + # If the cutoff is 5, and only 3 unique LUIDs with the URI prefix + # were identified, we're going to disregard this URI prefix. + if cutoff is None or len(luids) >= cutoff + ] + records = [ + Record(prefix=f"{metaprefix}{uri_prefix_index}", uri_prefix=uri_prefix) + for uri_prefix_index, uri_prefix in enumerate(uri_prefixes, start=1) + ] + return Converter(records) + + +#: The default delimiters used when guessing URI prefixes +DEFAULT_DELIMITERS = ("#", "/", "_") + + +def _get_uri_prefix_to_luids( + *, + converter: Optional[Converter] = None, + uris: Iterable[str], + delimiters: Optional[Sequence[str]] = None, +) -> Mapping[str, Set[str]]: + """Get a mapping from putative URI prefixes to corresponding putative local unique identifiers. + + :param converter: + A converter with pre-existing definitions. URI prefixes + are considered "new" if they can't already be validated by this converter + :param uris: + An iterable of URIs to search through. 
Will be taken as a set and + each unique entry is only considered once. + :param delimiters: + The delimiters considered between a putative URI prefix and putative + local unique identifier. By default, checks ``#`` first since this is + commonly used for URL fragments, then ``/`` since many URIs are constructed + with these. + :returns: + A dictionary of putative URI prefixes to sets of putative local unique identifiers + """ + if not delimiters: + delimiters = DEFAULT_DELIMITERS + uri_prefix_to_luids = defaultdict(set) + for uri in uris: + if converter is not None and converter.is_uri(uri): + continue + if uri.startswith("https://github.com") and "issues" in uri: + # TODO it's not really the job of :mod:`curies` to incorporate special cases, + # but the GitHub thing is such an annoyance... + continue + for delimiter in delimiters: + if delimiter not in uri: + continue + uri_prefix, luid = uri.rsplit(delimiter, maxsplit=1) + if luid.isalnum(): + uri_prefix_to_luids[uri_prefix + delimiter].add(luid) + break + return dict(uri_prefix_to_luids) diff --git a/tests/test_discovery.py b/tests/test_discovery.py new file mode 100644 index 0000000..fc7441c --- /dev/null +++ b/tests/test_discovery.py @@ -0,0 +1,82 @@ +"""Test discovering a prefix map from a list of URIs.""" + +import unittest +from typing import ClassVar + +import rdflib + +from curies import Converter, Record +from curies.discovery import discover, discover_from_rdf +from tests.constants import SLOW + + +class TestDiscovery(unittest.TestCase): + """Test discovery of URI prefixes.""" + + converter: ClassVar[Converter] + + @classmethod + def setUpClass(cls) -> None: + """Set up the test case with a dummy converter.""" + cls.converter = Converter( + [ + Record(prefix="GO", uri_prefix="http://purl.obolibrary.org/obo/GO_"), + Record(prefix="rdfs", uri_prefix=str(rdflib.RDFS._NS)), + ] + ) + + def test_simple(self): + """Test a simple case of discovering URI prefixes.""" + uris = [f"http://ran.dom/{i:03}" 
for i in range(30)] + uris.append("http://purl.obolibrary.org/obo/GO_0001234") + + converter = discover(uris, cutoff=3, converter=self.converter) + self.assertEqual([Record(prefix="ns1", uri_prefix="http://ran.dom/")], converter.records) + self.assertEqual("ns1:001", converter.compress("http://ran.dom/001")) + self.assertIsNone( + converter.compress("http://purl.obolibrary.org/obo/GO_0001234"), + msg="discovered converter should not inherit reference converter's definitions", + ) + + converter = discover(uris, cutoff=50, converter=self.converter) + self.assertEqual([], converter.records) + self.assertIsNone( + converter.compress("http://ran.dom/001"), + msg="cutoff was high, so discovered converter should not detect `http://ran.dom/`", + ) + + def test_rdflib(self): + """Test discovery in RDFlib.""" + graph = rdflib.Graph() + for i in range(30): + graph.add( + ( + rdflib.URIRef(f"http://ran.dom/{i:03}"), + rdflib.RDFS.subClassOf, + rdflib.URIRef(f"http://ran.dom/{i + 1:03}"), + ) + ) + graph.add( + ( + rdflib.URIRef(f"http://ran.dom/{i:03}"), + rdflib.RDFS.label, + rdflib.Literal(f"Node {i}"), + ) + ) + + converter = discover_from_rdf(graph, converter=self.converter) + self.assertEqual([Record(prefix="ns1", uri_prefix="http://ran.dom/")], converter.records) + self.assertEqual("ns1:001", converter.compress("http://ran.dom/001")) + self.assertIsNone( + converter.compress("http://purl.obolibrary.org/obo/GO_0001234"), + msg="discovered converter should not inherit reference converter's definitions", + ) + + @SLOW + def test_remote(self): + """Test parsing AEON.""" + converter = discover_from_rdf( + graph="https://raw.githubusercontent.com/tibonto/aeon/main/aeon.owl", + format="xml", + ) + self.assertIn("http://purl.obolibrary.org/obo/AEON_", converter.reverse_prefix_map)