From 470f71a69264c17260823d485de113695076a38f Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Mon, 22 Apr 2024 14:27:19 +0200
Subject: [PATCH] Add JSON schema for extended prefix map (#109)

---
 MANIFEST.in            |  1 +
 docs/make_schema.py    | 54 +++++++++++++++++++++++++++++++++++
 docs/schema.json       | 64 ++++++++++++++++++++++++++++++++++++++++++
 docs/source/struct.rst |  6 +++-
 src/curies/__init__.py |  2 ++
 src/curies/api.py      | 63 ++++++++++++++++++++++++++++++-----------
 tests/test_api.py      | 11 ++++++++
 7 files changed, 184 insertions(+), 17 deletions(-)
 create mode 100644 docs/make_schema.py
 create mode 100644 docs/schema.json

diff --git a/MANIFEST.in b/MANIFEST.in
index 6c3ce2a..265cc66 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -16,3 +16,4 @@ global-exclude *.py[cod] __pycache__ *.so *.dylib .DS_Store *.gpickle
 
 include README.md LICENSE
 exclude tox.ini .flake8 .bumpversion.cfg .readthedocs.yml codecov.yml
+exclude docs/make_schema.py docs/schema.json
diff --git a/docs/make_schema.py b/docs/make_schema.py
new file mode 100644
index 0000000..63b0f58
--- /dev/null
+++ b/docs/make_schema.py
@@ -0,0 +1,54 @@
+"""Generate a JSON schema for extended prefix maps."""
+
+import json
+from pathlib import Path
+
+from curies import Records
+from curies._pydantic_compat import PYDANTIC_V1
+
+HERE = Path(__file__).parent.resolve()  # directory containing this script
+PATH = HERE.joinpath("schema.json")  # output file written by main()
+TITLE = "Extended Prefix Map"
+DESCRIPTION = (
+    """\
+An extended prefix map is a generalization of a prefix map that
+includes synonyms for URI prefixes and CURIE prefixes.
+""".strip()
+    .replace("\n", " ")  # join the wrapped source lines into a single line
+    .replace("  ", " ")
+)
+URL = "https://w3id.org/biopragmatics/schema/epm.json"  # used as the schema's "$id"
+
+
+def main() -> None:
+    """Write the JSON schema for extended prefix maps to ``schema.json`` beside this script."""
+    rv = {
+        "$schema": "http://json-schema.org/draft-07/schema#",
+        "$id": URL,
+    }
+
+    if PYDANTIC_V1:
+        import pydantic.schema
+
+        # see https://docs.pydantic.dev/latest/usage/json_schema/#general-notes-on-json-schema-generation
+        # NOTE(review): this branch is V1, but /latest/ is the V2 docs path used in api.py; confirm link
+        schema_dict = pydantic.schema.schema(
+            [Records],
+            title=TITLE,
+            description=DESCRIPTION,
+        )
+    else:
+        from pydantic.json_schema import models_json_schema
+
+        _, schema_dict = models_json_schema(
+            [(Records, "validation")],
+            title=TITLE,
+            description=DESCRIPTION,
+        )
+
+    rv.update(schema_dict)  # NOTE(review): output has no root "$ref" or "type"; confirm
+    PATH.write_text(json.dumps(rv, indent=2) + "\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/schema.json b/docs/schema.json
new file mode 100644
index 0000000..56e9b51
--- /dev/null
+++ b/docs/schema.json
@@ -0,0 +1,64 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://w3id.org/biopragmatics/schema/epm.json",
+  "$defs": {
+    "Record": {
+      "description": "A record of some prefixes and their associated URI prefixes.\n\n.. seealso:: https://github.com/cthoyt/curies/issues/70",
+      "properties": {
+        "prefix": {
+          "description": "The canonical CURIE prefix, used in the reverse prefix map",
+          "title": "CURIE prefix",
+          "type": "string"
+        },
+        "uri_prefix": {
+          "description": "The canonical URI prefix, used in the forward prefix map",
+          "title": "URI prefix",
+          "type": "string"
+        },
+        "prefix_synonyms": {
+          "items": {
+            "type": "string"
+          },
+          "title": "CURIE prefix synonyms",
+          "type": "array"
+        },
+        "uri_prefix_synonyms": {
+          "items": {
+            "type": "string"
+          },
+          "title": "URI prefix synonyms",
+          "type": "array"
+        },
+        "pattern": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "The regular expression pattern for entries in this semantic space. Warning: this is an experimental feature.",
+          "title": "Pattern"
+        }
+      },
+      "required": [
+        "prefix",
+        "uri_prefix"
+      ],
+      "title": "Record",
+      "type": "object"
+    },
+    "Records": {
+      "description": "A list of records.",
+      "items": {
+        "$ref": "#/$defs/Record"
+      },
+      "title": "Records",
+      "type": "array"
+    }
+  },
+  "title": "Extended Prefix Map",
+  "description": "An extended prefix map is a generalization of a prefix map that includes synonyms for URI prefixes and CURIE prefixes."
+}
diff --git a/docs/source/struct.rst b/docs/source/struct.rst
index 92edece..8814e00 100644
--- a/docs/source/struct.rst
+++ b/docs/source/struct.rst
@@ -108,7 +108,8 @@ containing an entry for ChEBI) looks like:
        }
    ]
 
-An EPM is simply a list of records (see :class:`curies.Record`). EPMs have the benefit that they are still
+An EPM is simply a list of records (see :class:`curies.Record` and :class:`curies.Records`).
+EPMs have the benefit that they are still
 encoded in JSON and can easily be encoded in YAML, TOML, RDF, and other schemata. Further, prefix maps can be
 automatically upgraded into EPMs (with some caveats) using :func:`curies.upgrade_prefix_map`.
 
@@ -118,3 +119,6 @@ automatically upgraded into EPMs (with some caveats) using :func:`curies.upgrade
     can be loaded using :meth:`curies.Converter.from_extended_prefix_map`.
     We provide a Pydantic model representing it. Later, we hope to have an external, stable definition
     of this data schema.
+
+A JSON schema for EPMs is available at https://w3id.org/biopragmatics/schema/epm.json.
+It is generated by the script at https://github.com/biopragmatics/curies/tree/main/docs/make_schema.py.
diff --git a/src/curies/__init__.py b/src/curies/__init__.py
index 14697ec..25e3c63 100644
--- a/src/curies/__init__.py
+++ b/src/curies/__init__.py
@@ -8,6 +8,7 @@
     DuplicateURIPrefixes,
     DuplicateValueError,
     Record,
+    Records,
     Reference,
     ReferenceTuple,
     chain,
@@ -35,6 +36,7 @@
 __all__ = [
     "Converter",
     "Record",
+    "Records",
     "ReferenceTuple",
     "Reference",
     "DuplicateValueError",
diff --git a/src/curies/api.py b/src/curies/api.py
index 053f6e1..cc0acef 100644
--- a/src/curies/api.py
+++ b/src/curies/api.py
@@ -50,6 +50,7 @@
     "Reference",
     "ReferenceTuple",
     "Record",
+    "Records",
     "DuplicateValueError",
     "DuplicatePrefixes",
     "DuplicateURIPrefixes",
@@ -252,26 +253,21 @@ def from_curie(cls, curie: str, sep: str = ":") -> "Reference":
 class Record(BaseModel):  # type:ignore
     """A record of some prefixes and their associated URI prefixes.
 
-    A list of records can be annotated in a FastAPI setting with the following:
-
-    .. code-block:: python
-
-        from typing import List
-        from curies import Record
-        from pydantic import BaseModel
-
-        class Records(BaseModel):
-            __root__ = List[Record]
-
     .. seealso:: https://github.com/cthoyt/curies/issues/70
     """
 
-    prefix: str = Field(..., description="The canonical prefix, used in the reverse prefix map")
+    prefix: str = Field(
+        ...,
+        title="CURIE prefix",
+        description="The canonical CURIE prefix, used in the reverse prefix map",
+    )
     uri_prefix: str = Field(
-        ..., description="The canonical URI prefix, used in the forward prefix map"
+        ...,
+        title="URI prefix",
+        description="The canonical URI prefix, used in the forward prefix map",
     )
-    prefix_synonyms: List[str] = Field(default_factory=list)
-    uri_prefix_synonyms: List[str] = Field(default_factory=list)
+    prefix_synonyms: List[str] = Field(default_factory=list, title="CURIE prefix synonyms")
+    uri_prefix_synonyms: List[str] = Field(default_factory=list, title="URI prefix synonyms")
     pattern: Optional[str] = Field(
         default=None,
         description="The regular expression pattern for entries in this semantic space. "
@@ -315,6 +311,40 @@ def _key(self) -> RecordKey:
         )
 
 
+if PYDANTIC_V1:
+    # Pydantic V1 expresses root models as "custom root types", explained at
+    # https://docs.pydantic.dev/1.10/usage/models/#custom-root-types
+
+    from pydantic import BaseModel
+
+    class Records(BaseModel):  # type:ignore
+        """A list of records."""
+
+        class Config:
+            """Configuration for the records."""
+
+            arbitrary_types_allowed = True
+
+        __root__: List[Record]  # custom root type: the model wraps this list
+
+        def __iter__(self) -> Iterable[Record]:
+            """Iterate over the records in the wrapped list."""
+            return cast(Iterable[Record], iter(self.__root__))
+
+else:
+    # An explanation of RootModels in Pydantic V2 can be found on
+    # https://docs.pydantic.dev/latest/concepts/models/#rootmodel-and-custom-root-types
+
+    from pydantic import RootModel
+
+    class Records(RootModel[List[Record]]):  # type:ignore
+        """A list of records."""  # also appears as "description" in docs/schema.json
+
+        def __iter__(self) -> Iterable[Record]:
+            """Iterate over the records in the wrapped list."""
+            return cast(Iterable[Record], iter(self.root))
+
+
 class DuplicateSummary(NamedTuple):
     """A triple representing two records that are duplicated, either based on a CURIE or URI prefix."""
 
@@ -548,7 +578,8 @@ def add_record(self, record: Record, case_sensitive: bool = True, merge: bool =
         """Append a record to the converter."""
         matched = self._match_record(record, case_sensitive=case_sensitive)
         if len(matched) > 1:
-            raise ValueError(f"new record has duplicates: {matched}")
+            msg = "".join(f"\n  {m} -> {v}" for m, v in matched.items())
+            raise ValueError(f"new record has duplicates:{msg}")
         if len(matched) == 1:
             if not merge:
                 raise ValueError(f"new record already exists and merge=False: {matched}")
diff --git a/tests/test_api.py b/tests/test_api.py
index b404f4e..ba3fc14 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -21,6 +21,7 @@
     ExpansionError,
     PrefixStandardizationError,
     Record,
+    Records,
     Reference,
     ReferenceTuple,
     URIStandardizationError,
@@ -41,6 +42,16 @@
 GO_URI_PREFIX = "http://purl.obolibrary.org/obo/GO_"
 
 
+class TestStruct(unittest.TestCase):
+    """Test the data structures."""
+
+    def test_records(self):
+        """Test that a parsed ``Records`` model can be passed directly to a converter."""
+        records = Records.parse_obj([{"prefix": "chebi", "uri_prefix": CHEBI_URI_PREFIX}])
+        converter = Converter(records=records)
+        self.assertEqual({"chebi"}, converter.get_prefixes())
+
+
 class TestAddRecord(unittest.TestCase):
     """Test adding records."""