From 5061a4291d2d0da95de4e4f6ef6b8113f7770f6c Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 14 Nov 2023 11:58:05 +0100 Subject: [PATCH] Add discovery algorithm (#93) This PR adds a workflow that iterates through a list of URIs and tries to discover new URI prefixes --- docs/source/discovery.rst | 256 +++++++++++++++++++++++++++++++++++++ docs/source/index.rst | 1 + src/curies/__init__.py | 4 + src/curies/discovery.py | 257 ++++++++++++++++++++++++++++++++++++++ tests/test_discovery.py | 82 ++++++++++++ 5 files changed, 600 insertions(+) create mode 100644 docs/source/discovery.rst create mode 100644 src/curies/discovery.py create mode 100644 tests/test_discovery.py diff --git a/docs/source/discovery.rst b/docs/source/discovery.rst new file mode 100644 index 0000000..b85d6c9 --- /dev/null +++ b/docs/source/discovery.rst @@ -0,0 +1,256 @@ +URI Prefix Discovery +==================== +.. automodule:: curies.discovery + +Discovering URI Prefixes from an Ontology +----------------------------------------- +A common place where discovering URI prefixes is important is when working with new ontologies. +In the following example, we look at the `Academic Event Ontology (AEON) `_. This +is an ontology developed under OBO Foundry principles describing academic events. Accordingly, it includes many +URI references to terms in OBO Foundry ontologies. + +In this tutorial, we use :func:`curies.discover` (and then :func:`curies.discover_from_rdf` as a nice convenience +function) to load the ontology in the RDF/XML format and discover putative URI prefixes. + +.. 
code-block:: python + + import curies + from curies.discovery import get_uris_from_rdf + + ONTOLOGY_URL = "https://raw.githubusercontent.com/tibonto/aeon/main/aeon.owl" + + uris = get_uris_from_rdf(ONTOLOGY_URL, format="xml") + discovered_converter = curies.discover(uris) + # note, these two steps can be combined with curies.discover_from_rdf, + # and we'll do that in the following examples + +We discovered fifty URI prefixes in the following table. Many of them appear to be OBO Foundry URI prefixes or +semantic web prefixes, so in the next step, we'll use prior knowledge to reduce the false discovery rate. + +============== ============================================================================== +curie_prefix uri_prefix +============== ============================================================================== +ns1 ``http://ontologydesignpatterns.org/wiki/Community:Parts_and_`` +ns2 ``http://protege.stanford.edu/plugins/owl/protege#`` +ns3 ``http://purl.obolibrary.org/obo/AEON_`` +ns4 ``http://purl.obolibrary.org/obo/APOLLO_SV_`` +ns5 ``http://purl.obolibrary.org/obo/BFO_`` +ns6 ``http://purl.obolibrary.org/obo/CRO_`` +ns7 ``http://purl.obolibrary.org/obo/ENVO_`` +ns8 ``http://purl.obolibrary.org/obo/IAO_`` +ns9 ``http://purl.obolibrary.org/obo/ICO_`` +ns10 ``http://purl.obolibrary.org/obo/NCBITaxon_`` +ns11 ``http://purl.obolibrary.org/obo/OBIB_`` +ns12 ``http://purl.obolibrary.org/obo/OBI_`` +ns13 ``http://purl.obolibrary.org/obo/OMO_`` +ns14 ``http://purl.obolibrary.org/obo/OOSTT_`` +ns15 ``http://purl.obolibrary.org/obo/RO_`` +ns16 ``http://purl.obolibrary.org/obo/TXPO_`` +ns17 ``http://purl.obolibrary.org/obo/bfo/axiom/`` +ns18 ``http://purl.obolibrary.org/obo/valid_for_`` +ns19 ``http://purl.obolibrary.org/obo/valid_for_go_`` +ns20 ``http://purl.obolibrary.org/obo/valid_for_go_annotation_`` +ns21 ``http://purl.obolibrary.org/obo/wikiCFP_`` +ns22 ``http://purl.org/dc/elements/1.1/`` +ns23 ``http://purl.org/dc/terms/`` +ns24 
``http://usefulinc.com/ns/doap#`` +ns25 ``http://wiki.geneontology.org/index.php/Involved_`` +ns26 ``http://www.geneontology.org/formats/oboInOwl#`` +ns27 ``http://www.geneontology.org/formats/oboInOwl#created_`` +ns28 ``http://www.w3.org/1999/02/22-rdf-syntax-ns#`` +ns29 ``http://www.w3.org/2000/01/rdf-schema#`` +ns30 ``http://www.w3.org/2001/XMLSchema#`` +ns31 ``http://www.w3.org/2002/07/owl#`` +ns32 ``http://www.w3.org/2003/11/swrl#`` +ns33 ``http://www.w3.org/2004/02/skos/core#`` +ns34 ``http://www.w3.org/ns/prov#`` +ns35 ``http://xmlns.com/foaf/0.1/`` +ns36 ``https://en.wikipedia.org/wiki/Allen%27s_interval_`` +ns37 ``https://groups.google.com/d/msg/bfo-owl-devel/s9Uug5QmAws/ZDRnpiIi_`` +ns38 ``https://ror.org/`` +ns39 ``https://w3id.org/scholarlydata/ontology/conference-ontology.owl#`` +ns40 ``https://w3id.org/seo#`` +ns41 ``https://www.confident-conference.org/index.php/Academic_Field:Information_`` +ns42 ``https://www.confident-conference.org/index.php/Event:VIVO_`` +ns43 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_`` +ns44 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_orga_`` +ns45 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk1_`` +ns46 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk2_`` +ns47 ``https://www.confident-conference.org/index.php/Event_Series:VIVO_`` +ns48 ``https://www.wikidata.org/wiki/`` +ns49 ``https://www.wikidata.org/wiki/Wikidata:Property_proposal/colocated_`` +ns50 ``urn:swrl#`` +============== ============================================================================== + +In the following block, we chain together (extended) prefix maps from the OBO Foundry as well as +a "semantic web" prefix map to try and reduce the number of false positives by passing them +through the ``converter`` keyword argument. + +.. 
code-block:: python + + import curies + + ONTOLOGY_URL = "https://raw.githubusercontent.com/tibonto/aeon/main/aeon.owl" + SEMWEB_URL = "https://raw.githubusercontent.com/biopragmatics/bioregistry/main/exports/contexts/semweb.context.jsonld" + + base_converter = curies.chain([ + curies.load_jsonld_context(SEMWEB_URL), + curies.get_obo_converter(), + ]) + + discovered_converter = curies.discover_from_rdf( + ONTOLOGY_URL, format="xml", converter=base_converter + ) + +We reduced the number of putative URI prefixes by half in the following table. However, we can still identify +some putative URI prefixes that likely would have appeared in a more comprehensive (extended) prefix map such +as the Bioregistry, including: + +- ``https://ror.org/`` for the `Research Organization Registry (ROR) `_ +- ``https://w3id.org/seo#`` for the `Scientific Event Ontology (SEO) `_ +- ``http://usefulinc.com/ns/doap#`` for the `Description of a Project (DOAP) vocabulary `_ + +Despite this, we're on our way! It's also obvious that several of the remaining putative URI prefixes come from +non-standard usage of the OBO PURL system (e.g., ``http://purl.obolibrary.org/obo/valid_for_go_annotation_``) +and some are proper false positives due to using ``_`` as a delimiter +(e.g., ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk2_``). 
+ +============== ============================================================================== +curie_prefix uri_prefix +============== ============================================================================== +ns1 ``http://ontologydesignpatterns.org/wiki/Community:Parts_and_`` +ns2 ``http://protege.stanford.edu/plugins/owl/protege#`` +ns3 ``http://purl.obolibrary.org/obo/AEON_`` +ns4 ``http://purl.obolibrary.org/obo/bfo/axiom/`` +ns5 ``http://purl.obolibrary.org/obo/valid_for_`` +ns6 ``http://purl.obolibrary.org/obo/valid_for_go_`` +ns7 ``http://purl.obolibrary.org/obo/valid_for_go_annotation_`` +ns8 ``http://purl.obolibrary.org/obo/wikiCFP_`` +ns9 ``http://usefulinc.com/ns/doap#`` +ns10 ``http://wiki.geneontology.org/index.php/Involved_`` +ns11 ``https://en.wikipedia.org/wiki/Allen%27s_interval_`` +ns12 ``https://groups.google.com/d/msg/bfo-owl-devel/s9Uug5QmAws/ZDRnpiIi_`` +ns13 ``https://ror.org/`` +ns14 ``https://w3id.org/scholarlydata/ontology/conference-ontology.owl#`` +ns15 ``https://w3id.org/seo#`` +ns16 ``https://www.confident-conference.org/index.php/Academic_Field:Information_`` +ns17 ``https://www.confident-conference.org/index.php/Event:VIVO_`` +ns18 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_`` +ns19 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_orga_`` +ns20 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk1_`` +ns21 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk2_`` +ns22 ``https://www.confident-conference.org/index.php/Event_Series:VIVO_`` +ns23 ``https://www.wikidata.org/wiki/`` +ns24 ``https://www.wikidata.org/wiki/Wikidata:Property_proposal/colocated_`` +ns25 ``urn:swrl#`` +============== ============================================================================== + +As a final step in our iterative journey of URI prefix discovery, we're going to use a cutoff for a minimum +of two appearances of a URI prefix to reduce the most spurious false positives. 
+ +.. code-block:: python + + import curies + + ONTOLOGY_URL = "https://raw.githubusercontent.com/tibonto/aeon/main/aeon.owl" + SEMWEB_URL = "https://raw.githubusercontent.com/biopragmatics/bioregistry/main/exports/contexts/semweb.context.jsonld" + + base_converter = curies.chain([ + curies.load_jsonld_context(SEMWEB_URL), + curies.get_obo_converter(), + ]) + + discovered_converter = curies.discover_from_rdf( + ONTOLOGY_URL, format="xml", converter=base_converter, cutoff=2 + ) + +We have reduced the list to a manageable set of 9 putative URI prefixes in the following table. + +============== ========================================================================= +curie_prefix uri_prefix +============== ========================================================================= +ns1 ``http://purl.obolibrary.org/obo/AEON_`` +ns2 ``http://purl.obolibrary.org/obo/bfo/axiom/`` +ns3 ``http://purl.obolibrary.org/obo/valid_for_go_`` +ns4 ``https://w3id.org/scholarlydata/ontology/conference-ontology.owl#`` +ns5 ``https://w3id.org/seo#`` +ns6 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_`` +ns7 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk1_`` +ns8 ``https://www.confident-conference.org/index.php/Event:VIVO_2021_talk2_`` +ns9 ``urn:swrl#`` +============== ========================================================================= + +Here are the calls to be made: + +- ``ns1`` represents the AEON vocabulary itself and should be given the ``aeon`` prefix. +- ``ns2`` and ``ns3`` are both false positives +- ``ns6``, ``ns7``, and ``ns8`` are a tricky case - they have a meaningful overlap that can't be easily + automatically detected (yet). In this case, it makes the most sense to add the shortest one manually + to the base converter with some unique name (don't use ``ns6`` as it will cause conflicts later), like in: + + .. 
code-block:: python + + base_converter = curies.chain([ + curies.load_jsonld_context(SEMWEB_URL), + curies.get_obo_converter(), + curies.load_prefix_map({"confident_event_vivo_2021": "https://www.confident-conference.org/index.php/Event:VIVO_2021_"}), + ]) + + In reality, these are all part of the `ConfIDent Event `_ vocabulary, + which has the URI prefix ``https://www.confident-conference.org/index.php/Event:``. +- ``ns4`` represents the `Conference Ontology `_ and + should be given the ``conference`` prefix. +- ``ns5`` represents the `Scientific Event Ontology (SEO) `_ and + should be given the ``seo`` prefix. +- ``ns9`` represents the `Semantic Web Rule Language `_, + though using URNs is an interesting choice in serialization. + +After we've made these calls, it's a good idea to write an (extended) prefix map. In this case, since we aren't working +with CURIE prefix synonyms nor URI prefix synonyms, it's okay to write a simple prefix map or a JSON-LD context without +losing information. + +.. note:: + + Postscript: throughout this guide, we used the following Python code to create the RST tables: + + .. code-block:: python + + def print_converter(converter) -> None: + from tabulate import tabulate + rows = sorted( + [ + (record.prefix, f"``{record.uri_prefix}``") + for record in converter.records + ], + key=lambda t: int(t[0].removeprefix("ns")), + ) + print(tabulate(rows, headers=["curie_prefix", "uri_prefix"], tablefmt="rst")) + +Just Make It Work, or, A Guide to Being a Questionable Semantic Citizen +----------------------------------------------------------------------- +The goal of the :mod:`curies` package is to provide the tools towards making semantically well-defined data, +which has a meaningful (extended) prefix map associated with it. Maybe you're in an organization that doesn't really +care about the utility of nice prefix maps, and just wants to get the job done where you need to turn URIs into *some* +CURIEs. 
+ +Here's a recipe for doing this, based on the last example with AEON: + +.. code-block:: python + + import curies + + ONTOLOGY_URL = "https://raw.githubusercontent.com/tibonto/aeon/main/aeon.owl" + + # Use the Bioregistry as a base converter since it's the most comprehensive one + base_converter = curies.get_bioregistry_converter() + + # Only discover what the Bioregistry doesn't already have + discovered_converter = curies.discover_from_rdf( + ONTOLOGY_URL, format="xml", converter=base_converter + ) + + # Chain together the base converter with the discoveries + augmented_converter = curies.chain([base_converter, discovered_converter]) + +With the augmented converter, you can now convert all URIs in the ontology into CURIEs. They will have a smattering +of unintelligible prefixes with no meaning, but at least the job is done! diff --git a/docs/source/index.rst b/docs/source/index.rst index 76082d7..8bbe6ba 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -63,6 +63,7 @@ for updating your code. tutorial reconciliation + discovery struct api services/index diff --git a/src/curies/__init__.py b/src/curies/__init__.py index 4ecb149..350a221 100644 --- a/src/curies/__init__.py +++ b/src/curies/__init__.py @@ -19,6 +19,7 @@ write_jsonld_context, write_shacl, ) +from .discovery import discover, discover_from_rdf from .reconciliation import remap_curie_prefixes, remap_uri_prefixes, rewire from .sources import ( get_bioregistry_converter, @@ -56,4 +57,7 @@ "get_monarch_converter", "get_go_converter", "get_bioregistry_converter", + # discovery + "discover", + "discover_from_rdf", ] diff --git a/src/curies/discovery.py b/src/curies/discovery.py new file mode 100644 index 0000000..cf8919e --- /dev/null +++ b/src/curies/discovery.py @@ -0,0 +1,257 @@ +"""Discover new entries for a Converter. + +The :func:`curies.discover` functionality is intended to be used in a "data science" workflow. 
Its goal is to +enable a data scientist to semi-interactively explore data (e.g., coming from an ontology, SSSOM, RDF) +that doesn't come with a complete (extended) prefix map and identify common URI prefixes. + +It returns the discovered URI prefixes in a :class:`curies.Converter` object with "dummy" CURIE prefixes. +This makes it possible to convert the URIs appearing in the data into CURIEs and therefore enables +their usage in places where CURIEs are expected. + +However, it's suggested that after discovering URI prefixes, the data scientist more carefully +constructs a meaningful prefix map based on the discovered one. This might include some or all of the +following steps: + +1. Replace dummy CURIE prefixes with meaningful ones +2. Remove spurious URI prefixes that appear but do not represent a semantic space. This happens often + due to using ``_`` as a delimiter or having a frequency cutoff of zero (see the parameters for this function). +3. Consider chaining a comprehensive extended prefix map such as the Bioregistry + (from :func:`curies.get_bioregistry_converter`) onto the converter passed to this function + so pre-existing URI prefixes are not *re-discovered*. + +Finally, you should save the prefix map that you create in a persistent place (e.g., inside a JSON file) such +that it can be reused. + +Algorithm +--------- +The :func:`curies.discover` function implements an algorithm that does the following for each URI: + +1. For each delimiter (in the priority order they are given) check if the delimiter is present. +2. If it's present, split the URI into two parts based on rightmost appearance of the delimiter. +3. If the right part after splitting is all alphanumeric characters, save the URI prefix (with delimiter attached) +4. If a delimiter is successfully used to identify a URI prefix, don't check any of the following delimiters + +After identifying putative URI prefixes, the second part of the algorithm does the following: + +1. 
If a cutoff was provided, remove all putative URI prefixes for which there were fewer examples than the cutoff +2. Sort the URI prefixes lexicographically (i.e., with :func:`sorted`) +3. Assign a dummy CURIE prefix to each URI prefix, counting upwards from 1 +4. Construct a converter from this prefix map and return it +""" + +from collections import defaultdict +from pathlib import PurePath +from typing import IO, TYPE_CHECKING, Any, Iterable, Mapping, Optional, Sequence, Set, TextIO, Union + +from typing_extensions import Literal + +from curies import Converter, Record + +if TYPE_CHECKING: + import rdflib + +__all__ = [ + "discover", + "discover_from_rdf", +] + + +GraphFormats = Literal["turtle", "xml", "n3", "nt", "trix"] +GraphInput = Union[IO[bytes], TextIO, "rdflib.parser.InputSource", str, bytes, PurePath] + + +def discover_from_rdf( + graph: Union[GraphInput, "rdflib.Graph"], + *, + format: Optional[GraphFormats] = None, + **kwargs: Any, +) -> Converter: + """Discover new URI prefixes from RDF content via :mod:`rdflib`. + + This function works the same as :func:`discover`, but gets its URI list from + a triple store. See :func:`discover` for a more detailed explanation of how this + algorithm works. + + :param graph: + Either a pre-instantiated RDFlib graph, or an input type to the ``source`` + keyword of :meth:`rdflib.Graph.parse`. This can be one of the following: + + - A string or bytes representation of a URL + - A string, bytes, or Path representation of a local file + - An I/O object that can be read directly + - An open XML reader from RDFlib (:class:`rdflib.parser.InputSource`) + :param format: If ``graph`` is given as a URL or I/O object, this + is passed through to the ``format`` keyword of :meth:`rdflib.Graph.parse`. + If none is given, defaults to ``turtle``. + :param kwargs: Keyword arguments passed through to :func:`discover` + :returns: + A converter with dummy prefixes for URI prefixes appearing in the RDF + content (i.e., triples). 
+ """ + uris = get_uris_from_rdf(graph=graph, format=format) + return discover(uris, **kwargs) + + +def get_uris_from_rdf( + graph: Union[GraphInput, "rdflib.Graph"], *, format: Optional[GraphFormats] = None +) -> Set[str]: + """Get a set of URIs from a graph.""" + graph = _ensure_graph(graph=graph, format=format) + return set(_yield_uris(graph=graph)) + + +def _ensure_graph( + *, graph: Union[GraphInput, "rdflib.Graph"], format: Optional[GraphFormats] = None +) -> "rdflib.Graph": + import rdflib + + if not isinstance(graph, rdflib.Graph): + _temp_graph = rdflib.Graph() + _temp_graph.parse(source=graph, format=format) + graph = _temp_graph + + return graph + + +def _yield_uris(*, graph: "rdflib.Graph") -> Iterable[str]: + import rdflib + + for parts in graph.triples((None, None, None)): + for part in parts: + if isinstance(part, rdflib.URIRef): + yield str(part) + + +def discover( + uris: Iterable[str], + *, + delimiters: Optional[Sequence[str]] = None, + cutoff: Optional[int] = None, + metaprefix: str = "ns", + converter: Optional[Converter] = None, +) -> Converter: + """Discover new URI prefixes and construct a converter with a unique dummy CURIE prefix for each. + + :param uris: + An iterable of URIs to search through. Will be taken as a set and + each unique entry is only considered once. + :param delimiters: + The character(s) that delimit a URI prefix from a local unique identifier. + If none given, defaults to using ``/``, ``#``, and ``_``. For example: + + - ``/`` is the delimiter in ``https://www.ncbi.nlm.nih.gov/pubmed/37929212``, which separates + the URI prefix ``https://www.ncbi.nlm.nih.gov/pubmed/`` from the local unique identifier + `37929212 `_ for the article + "New insights into osmobiosis and chemobiosis in tardigrades" in PubMed. 
- ``#`` is the delimiter in ``http://www.w3.org/2000/01/rdf-schema#label``, which separates + the URI prefix ``http://www.w3.org/2000/01/rdf-schema#`` from the local unique identifier + `label `_ for the term "label" in the RDF Schema. + The ``#`` is typically used in a URL to denote a fragment and commonly appears in small semantic + web vocabularies that are shown as a single HTML page. + - ``_`` is the delimiter in ``http://purl.obolibrary.org/obo/GO_0032571``, which separates + the URI prefix ``http://purl.obolibrary.org/obo/GO_`` from the local unique identifier + `0032571 `_ for the term "response to vitamin K" + in the Gene Ontology + + .. note:: The delimiter is itself a part of the URI prefix + :param cutoff: + If given, will require at least ``cutoff`` unique local unique identifiers + associated with a given URI prefix to keep it. + + If not given, no cutoff is applied, which increases recall (i.e., likelihood of getting all + possible URI prefixes) but decreases precision (i.e., more of the results + might be false positives / spurious). If you get a lot of false positives, + try increasing first to 1, 2, then maybe higher. + :param metaprefix: + The beginning part of each dummy prefix, followed by a number. The default value + is ``ns``, so dummy prefixes are named ``ns1``, ``ns2``, and so on. + :param converter: + If a pre-existing converter is passed, then URIs that can be parsed using the + pre-existing converter are not considered during discovery. + + For example, if you're an OBO person working with URIs coming from an OBO ontology, + it makes sense to pass the converter from :func:`curies.get_obo_converter` to + reduce false positive discoveries. 
More generally, a comprehensive converter + like the Bioregistry (from :func:`curies.get_bioregistry_converter`) can massively + reduce false positive discoveries and ultimately reduce burden on the data scientist + using this function when needing to understand the results and carefully curate a prefix + map based on the discoveries. + :returns: + A converter with dummy prefixes + + .. code-block:: python + + >>> import curies + + # Generate some example URIs + >>> uris = [f"http://ran.dom/{i:03}" for i in range(30)] + + >>> discovered_converter = curies.discover(uris) + >>> discovered_converter.records + [Record(prefix="ns1", uri_prefix="http://ran.dom/")] + + # Now, you can compress the URIs to dummy CURIEs + >>> discovered_converter.compress("http://ran.dom/002") + 'ns1:002' + """ + uri_prefix_to_luids = _get_uri_prefix_to_luids( + converter=converter, uris=uris, delimiters=delimiters + ) + uri_prefixes = [ + uri_prefix + for uri_prefix, luids in sorted(uri_prefix_to_luids.items()) + # If the cutoff is 5, and only 3 unique LUIDs with the URI prefix + # were identified, we're going to disregard this URI prefix. + if cutoff is None or len(luids) >= cutoff + ] + records = [ + Record(prefix=f"{metaprefix}{uri_prefix_index}", uri_prefix=uri_prefix) + for uri_prefix_index, uri_prefix in enumerate(uri_prefixes, start=1) + ] + return Converter(records) + + +#: The default delimiters used when guessing URI prefixes +DEFAULT_DELIMITERS = ("#", "/", "_") + + +def _get_uri_prefix_to_luids( + *, + converter: Optional[Converter] = None, + uris: Iterable[str], + delimiters: Optional[Sequence[str]] = None, +) -> Mapping[str, Set[str]]: + """Get a mapping from putative URI prefixes to corresponding putative local unique identifiers. + + :param converter: + A converter with pre-existing definitions. URI prefixes + are considered "new" if they can't already be validated by this converter + :param uris: + An iterable of URIs to search through. 
Will be taken as a set and + each unique entry is only considered once. + :param delimiters: + The delimiters considered between a putative URI prefix and putative + local unique identifier. By default, checks ``#`` first since this is + commonly used for URL fragments, then ``/`` since many URIs are constructed + with these. + :returns: + A dictionary of putative URI prefixes to sets of putative local unique identifiers + """ + if not delimiters: + delimiters = DEFAULT_DELIMITERS + uri_prefix_to_luids = defaultdict(set) + for uri in uris: + if converter is not None and converter.is_uri(uri): + continue + if uri.startswith("https://github.com") and "issues" in uri: + # TODO it's not really the job of :mod:`curies` to incorporate special cases, + # but the GitHub thing is such an annoyance... + continue + for delimiter in delimiters: + if delimiter not in uri: + continue + uri_prefix, luid = uri.rsplit(delimiter, maxsplit=1) + if luid.isalnum(): + uri_prefix_to_luids[uri_prefix + delimiter].add(luid) + break + return dict(uri_prefix_to_luids) diff --git a/tests/test_discovery.py b/tests/test_discovery.py new file mode 100644 index 0000000..fc7441c --- /dev/null +++ b/tests/test_discovery.py @@ -0,0 +1,82 @@ +"""Test discovering a prefix map from a list of URIs.""" + +import unittest +from typing import ClassVar + +import rdflib + +from curies import Converter, Record +from curies.discovery import discover, discover_from_rdf +from tests.constants import SLOW + + +class TestDiscovery(unittest.TestCase): + """Test discovery of URI prefixes.""" + + converter: ClassVar[Converter] + + @classmethod + def setUpClass(cls) -> None: + """Set up the test case with a dummy converter.""" + cls.converter = Converter( + [ + Record(prefix="GO", uri_prefix="http://purl.obolibrary.org/obo/GO_"), + Record(prefix="rdfs", uri_prefix=str(rdflib.RDFS._NS)), + ] + ) + + def test_simple(self): + """Test a simple case of discovering URI prefixes.""" + uris = [f"http://ran.dom/{i:03}" 
for i in range(30)] + uris.append("http://purl.obolibrary.org/obo/GO_0001234") + + converter = discover(uris, cutoff=3, converter=self.converter) + self.assertEqual([Record(prefix="ns1", uri_prefix="http://ran.dom/")], converter.records) + self.assertEqual("ns1:001", converter.compress("http://ran.dom/001")) + self.assertIsNone( + converter.compress("http://purl.obolibrary.org/obo/GO_0001234"), + msg="discovered converter should not inherit reference converter's definitions", + ) + + converter = discover(uris, cutoff=50, converter=self.converter) + self.assertEqual([], converter.records) + self.assertIsNone( + converter.compress("http://ran.dom/001"), + msg="cutoff was high, so discovered converter should not detect `http://ran.dom/`", + ) + + def test_rdflib(self): + """Test discovery in RDFlib.""" + graph = rdflib.Graph() + for i in range(30): + graph.add( + ( + rdflib.URIRef(f"http://ran.dom/{i:03}"), + rdflib.RDFS.subClassOf, + rdflib.URIRef(f"http://ran.dom/{i + 1:03}"), + ) + ) + graph.add( + ( + rdflib.URIRef(f"http://ran.dom/{i:03}"), + rdflib.RDFS.label, + rdflib.Literal(f"Node {i}"), + ) + ) + + converter = discover_from_rdf(graph, converter=self.converter) + self.assertEqual([Record(prefix="ns1", uri_prefix="http://ran.dom/")], converter.records) + self.assertEqual("ns1:001", converter.compress("http://ran.dom/001")) + self.assertIsNone( + converter.compress("http://purl.obolibrary.org/obo/GO_0001234"), + msg="discovered converter should not inherit reference converter's definitions", + ) + + @SLOW + def test_remote(self): + """Test parsing AEON.""" + converter = discover_from_rdf( + graph="https://raw.githubusercontent.com/tibonto/aeon/main/aeon.owl", + format="xml", + ) + self.assertIn("http://purl.obolibrary.org/obo/AEON_", converter.reverse_prefix_map)