feat: Add support for DCAT creator field in dataset metadata

- Added fields to store creator details (name, email, URL, and identifier) in the DCAT profile.
- Implemented functionality to serialize and deserialize creator information similar to the publisher.
- Updated RDF generation logic to include creator fields in the output graph.
- Enhanced unit tests to verify proper handling and serialization of creator metadata.
ckan · Oct 3, 2024 · d8461e2 · d8461e2
1 parent d5b8e92
commit d8461e2
Show file tree

Hide file tree

Showing 8 changed files with 92 additions and 118 deletions.
diff --git a/ckanext/dcat/converters.py b/ckanext/dcat/converters.py
@@ -29,8 +29,8 @@ def dcat_to_ckan(dcat_dict):
     elif isinstance(dcat_publisher, dict) and dcat_publisher.get('name'):
         package_dict['extras'].append({'key': 'dcat_publisher_name', 'value': dcat_publisher.get('name')})
 
-        if dcat_publisher.get('mbox'):
-            package_dict['extras'].append({'key': 'dcat_publisher_email', 'value': dcat_publisher.get('mbox')})
+        if dcat_publisher.get('email'):
+            package_dict['extras'].append({'key': 'dcat_publisher_email', 'value': dcat_publisher.get('email')})
 
         if dcat_publisher.get('identifier'):
             package_dict['extras'].append({
@@ -45,8 +45,8 @@ def dcat_to_ckan(dcat_dict):
         if dcat_creator.get('name'):
             package_dict['extras'].append({'key': 'dcat_creator_name', 'value': dcat_creator.get('name')})
 
-        if dcat_creator.get('mbox'):
-            package_dict['extras'].append({'key': 'dcat_creator_email', 'value': dcat_creator.get('mbox')})
+        if dcat_creator.get('email'):
+            package_dict['extras'].append({'key': 'dcat_creator_email', 'value': dcat_creator.get('email')})
 
         if dcat_creator.get('identifier'):
             package_dict['extras'].append({
@@ -106,7 +106,7 @@ def ckan_to_dcat(package_dict):
             dcat_dict['publisher']['name'] = extra['value']
 
         elif extra['key'] == 'dcat_publisher_email':
-            dcat_dict['publisher']['mbox'] = extra['value']
+            dcat_dict['publisher']['email'] = extra['value']
 
         elif extra['key'] == 'dcat_publisher_id':
             dcat_dict['publisher']['identifier'] = extra['value']
@@ -116,7 +116,7 @@ def ckan_to_dcat(package_dict):
             dcat_dict['creator']['name'] = extra['value']
 
         elif extra['key'] == 'dcat_creator_email':
-            dcat_dict['creator']['mbox'] = extra['value']
+            dcat_dict['creator']['email'] = extra['value']
 
         elif extra['key'] == 'dcat_creator_id':
             dcat_dict['creator']['identifier'] = extra['value']
@@ -129,13 +129,13 @@ def ckan_to_dcat(package_dict):
     if not dcat_dict['publisher'].get('name') and package_dict.get('maintainer'):
         dcat_dict['publisher']['name'] = package_dict.get('maintainer')
         if package_dict.get('maintainer_email'):
-            dcat_dict['publisher']['mbox'] = package_dict.get('maintainer_email')
+            dcat_dict['publisher']['email'] = package_dict.get('maintainer_email')
 
     # Fallback for creator (if no name in extras, optionally use author)
     if not dcat_dict['creator'].get('name') and package_dict.get('author'):
         dcat_dict['creator']['name'] = package_dict.get('author')
         if package_dict.get('author_email'):
-            dcat_dict['creator']['mbox'] = package_dict.get('author_email')
+            dcat_dict['creator']['email'] = package_dict.get('author_email')
 
     dcat_dict['distribution'] = []
     for resource in package_dict.get('resources', []):

diff --git a/ckanext/dcat/profiles/euro_dcat_ap_base.py b/ckanext/dcat/profiles/euro_dcat_ap_base.py
@@ -50,10 +50,10 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):
 
         # Basic fields
         for key, predicate in (
-                ("title", DCT.title),
-                ("notes", DCT.description),
-                ("url", DCAT.landingPage),
-                ("version", OWL.versionInfo),
+            ("title", DCT.title),
+            ("notes", DCT.description),
+            ("url", DCAT.landingPage),
+            ("version", OWL.versionInfo),
         ):
             value = self._object_value(dataset_ref, predicate)
             if value:
@@ -78,30 +78,30 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):
 
         #  Simple values
         for key, predicate in (
-                ("issued", DCT.issued),
-                ("modified", DCT.modified),
-                ("identifier", DCT.identifier),
-                ("version_notes", ADMS.versionNotes),
-                ("frequency", DCT.accrualPeriodicity),
-                ("provenance", DCT.provenance),
-                ("dcat_type", DCT.type),
+            ("issued", DCT.issued),
+            ("modified", DCT.modified),
+            ("identifier", DCT.identifier),
+            ("version_notes", ADMS.versionNotes),
+            ("frequency", DCT.accrualPeriodicity),
+            ("provenance", DCT.provenance),
+            ("dcat_type", DCT.type),
         ):
             value = self._object_value(dataset_ref, predicate)
             if value:
                 dataset_dict["extras"].append({"key": key, "value": value})
 
         #  Lists
         for key, predicate, in (
-                ("language", DCT.language),
-                ("theme", DCAT.theme),
-                ("alternate_identifier", ADMS.identifier),
-                ("conforms_to", DCT.conformsTo),
-                ("documentation", FOAF.page),
-                ("related_resource", DCT.relation),
-                ("has_version", DCT.hasVersion),
-                ("is_version_of", DCT.isVersionOf),
-                ("source", DCT.source),
-                ("sample", ADMS.sample),
+            ("language", DCT.language),
+            ("theme", DCAT.theme),
+            ("alternate_identifier", ADMS.identifier),
+            ("conforms_to", DCT.conformsTo),
+            ("documentation", FOAF.page),
+            ("related_resource", DCT.relation),
+            ("has_version", DCT.hasVersion),
+            ("is_version_of", DCT.isVersionOf),
+            ("source", DCT.source),
+            ("sample", ADMS.sample),
         ):
             values = self._object_value_list(dataset_ref, predicate)
             if values:
@@ -177,14 +177,14 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):
 
             #  Simple values
             for key, predicate in (
-                    ("name", DCT.title),
-                    ("description", DCT.description),
-                    ("access_url", DCAT.accessURL),
-                    ("download_url", DCAT.downloadURL),
-                    ("issued", DCT.issued),
-                    ("modified", DCT.modified),
-                    ("status", ADMS.status),
-                    ("license", DCT.license),
+                ("name", DCT.title),
+                ("description", DCT.description),
+                ("access_url", DCAT.accessURL),
+                ("download_url", DCAT.downloadURL),
+                ("issued", DCT.issued),
+                ("modified", DCT.modified),
+                ("status", ADMS.status),
+                ("license", DCT.license),
             ):
                 value = self._object_value(distribution, predicate)
                 if value:
@@ -195,9 +195,9 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):
             ) or self._object_value(distribution, DCAT.accessURL)
             #  Lists
             for key, predicate in (
-                    ("language", DCT.language),
-                    ("documentation", FOAF.page),
-                    ("conforms_to", DCT.conformsTo),
+                ("language", DCT.language),
+                ("documentation", FOAF.page),
+                ("conforms_to", DCT.conformsTo),
             ):
                 values = self._object_value_list(distribution, predicate)
                 if values:
@@ -252,10 +252,10 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):
             # versions of the ckanext-dcat parsers
             for extra in dataset_dict["extras"]:
                 if extra["key"] in (
-                        "issued",
-                        "modified",
-                        "publisher_name",
-                        "publisher_email",
+                    "issued",
+                    "modified",
+                    "publisher_name",
+                    "publisher_email",
                 ):
                     extra["key"] = "dcat_" + extra["key"]
 
@@ -315,15 +315,15 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref):
 
         # Contact details
         if any(
-                [
-                    self._get_dataset_value(dataset_dict, "contact_uri"),
-                    self._get_dataset_value(dataset_dict, "contact_name"),
-                    self._get_dataset_value(dataset_dict, "contact_email"),
-                    self._get_dataset_value(dataset_dict, "maintainer"),
-                    self._get_dataset_value(dataset_dict, "maintainer_email"),
-                    self._get_dataset_value(dataset_dict, "author"),
-                    self._get_dataset_value(dataset_dict, "author_email"),
-                ]
+            [
+                self._get_dataset_value(dataset_dict, "contact_uri"),
+                self._get_dataset_value(dataset_dict, "contact_name"),
+                self._get_dataset_value(dataset_dict, "contact_email"),
+                self._get_dataset_value(dataset_dict, "maintainer"),
+                self._get_dataset_value(dataset_dict, "maintainer_email"),
+                self._get_dataset_value(dataset_dict, "author"),
+                self._get_dataset_value(dataset_dict, "author_email"),
+            ]
         ):
 
             contact_uri = self._get_dataset_value(dataset_dict, "contact_uri")
@@ -360,10 +360,10 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref):
             # Scheming publisher field: will be handled in a separate profile
             pass
         elif any(
-                [
-                    self._get_dataset_value(dataset_dict, "publisher_uri"),
-                    self._get_dataset_value(dataset_dict, "publisher_name"),
-                ]
+            [
+                self._get_dataset_value(dataset_dict, "publisher_uri"),
+                self._get_dataset_value(dataset_dict, "publisher_name"),
+            ]
         ):
             # Legacy publisher_* extras
             publisher_uri = self._get_dataset_value(dataset_dict, "publisher_uri")
@@ -425,10 +425,10 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref):
             # Scheming creator field: will be handled in a separate profile
             pass
         elif any(
-                [
-                    self._get_dataset_value(dataset_dict, "creator_uri"),
-                    self._get_dataset_value(dataset_dict, "creator_name"),
-                ]
+            [
+                self._get_dataset_value(dataset_dict, "creator_uri"),
+                self._get_dataset_value(dataset_dict, "creator_name"),
+            ]
         ):
             # Legacy creator_* extras
             creator_uri = self._get_dataset_value(dataset_dict, "creator_uri")
@@ -492,11 +492,11 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref):
         resource_license_fallback = None
         if toolkit.asbool(config.get(DISTRIBUTION_LICENSE_FALLBACK_CONFIG, False)):
             if "license_id" in dataset_dict and isinstance(
-                    URIRefOrLiteral(dataset_dict["license_id"]), URIRef
+                URIRefOrLiteral(dataset_dict["license_id"]), URIRef
             ):
                 resource_license_fallback = dataset_dict["license_id"]
             elif "license_url" in dataset_dict and isinstance(
-                    URIRefOrLiteral(dataset_dict["license_url"]), URIRef
+                URIRefOrLiteral(dataset_dict["license_url"]), URIRef
             ):
                 resource_license_fallback = dataset_dict["license_url"]
 
@@ -559,9 +559,9 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref):
             # check which type is appropriate.
             if fmt and (not mimetype or mimetype == fmt):
                 if (
-                        "iana.org/assignments/media-types" in fmt
-                        or not fmt.startswith("http")
-                        and "/" in fmt
+                    "iana.org/assignments/media-types" in fmt
+                    or not fmt.startswith("http")
+                    and "/" in fmt
                 ):
                     # output format value as dcat:mediaType instead of dct:format
                     mimetype = fmt

diff --git a/ckanext/dcat/profiles/schemaorg.py b/ckanext/dcat/profiles/schemaorg.py
@@ -167,11 +167,11 @@ def _agent_graph(self, dataset_ref, dataset_dict, agent_type, schema_property_pr
         identifier_key = f"{schema_property_prefix}_identifier"
 
         if any(
-                [
-                    self._get_dataset_value(dataset_dict, uri_key),
-                    self._get_dataset_value(dataset_dict, name_key),
-                    dataset_dict.get("organization"),
-                ]
+            [
+                self._get_dataset_value(dataset_dict, uri_key),
+                self._get_dataset_value(dataset_dict, name_key),
+                dataset_dict.get("organization"),
+            ]
         ):
             agent_uri = self._get_dataset_value(dataset_dict, uri_key)
             agent_uri_fallback = publisher_uri_organization_fallback(dataset_dict)
@@ -188,9 +188,9 @@ def _agent_graph(self, dataset_ref, dataset_dict, agent_type, schema_property_pr
             self.g.add((dataset_ref, agent_type, agent_details))
 
             if (
-                    not agent_name
-                    and not agent_uri
-                    and dataset_dict.get("organization")
+                not agent_name
+                and not agent_uri
+                and dataset_dict.get("organization")
             ):
                 agent_name = dataset_dict["organization"]["title"]
             self.g.add((agent_details, SCHEMA.name, Literal(agent_name)))

diff --git a/ckanext/dcat/schemas/dcat_ap_recommended.yaml b/ckanext/dcat/schemas/dcat_ap_recommended.yaml
@@ -72,40 +72,6 @@ dataset_fields:
       help_text: Unique identifier for the publisher, such as a ROR ID.
   help_text: Entity responsible for making the dataset available.
 
-- field_name: creator
-  label: Creator
-  repeating_label: Creator
-  repeating_once: true
-  repeating_subfields:
-
-    - field_name: uri
-      label: URI
-      help_text: URI of the creator, if available.
-
-    - field_name: name
-      label: Name
-      help_text: Name of the entity or person who created the dataset.
-
-    - field_name: email
-      label: Email
-      display_snippet: email.html
-      help_text: Contact email of the creator.
-
-    - field_name: url
-      label: URL
-      display_snippet: link.html
-      help_text: URL for more information about the creator.
-
-    - field_name: type
-      label: Type
-      help_text: Type of creator (e.g., Organization, Person).
-
-    - field_name: identifier
-      label: Identifier
-      help_text: Unique identifier for the creator, such as an ORCID or ROR ID.
-
-  help_text: Entity responsible for creating the dataset.
-
 - field_name: license_id
   label: License
   form_snippet: license.html

diff --git a/ckanext/dcat/tests/profiles/base/test_base_profile.py b/ckanext/dcat/tests/profiles/base/test_base_profile.py
@@ -660,7 +660,7 @@ def test_publisher_foaf(self):
 
         p = RDFProfile(g)
 
-        publisher = p._publisher(URIRef('http://example.org'), DCT.publisher)
+        publisher = p._agent_details(URIRef('http://example.org'), DCT.publisher)
 
         assert publisher['uri'] == 'http://orgs.vocab.org/some-org'
         assert publisher['name'] == 'Publishing Organization for dataset 1'
@@ -688,7 +688,7 @@ def test_publisher_ref(self):
 
         p = RDFProfile(g)
 
-        publisher = p._publisher(URIRef('http://example.org'), DCT.publisher)
+        publisher = p._agent_details(URIRef('http://example.org'), DCT.publisher)
 
         assert publisher['uri'] == 'http://orgs.vocab.org/some-org'
 

diff --git a/examples/ckan/ckan_dataset.json b/examples/ckan/ckan_dataset.json
@@ -8,6 +8,8 @@
             {"key": "guid", "value": "9df8df51-63db-37a8-e044-0003ba9b0d98"},
             {"key": "dcat_publisher_name", "value": "Geological Society"},
             {"key": "dcat_publisher_email", "value": "info@gs.org"},
+            {"key": "dcat_creator_name", "value": "John Doe"},
+            {"key": "dcat_creator_email", "value": "johndoe@example.com"},
             {"key": "language", "value": "en,es,ca"}
     ],
     "resources": [{"id": "b1e0b666-b7f4-44c1-9b16-56c78e86b66a",

diff --git a/examples/ckan/dataset.json b/examples/ckan/dataset.json
@@ -9,11 +9,17 @@
     "keyword" : ["exploration", "geochemical-exploration", "geochemical-maps", "geochemistry", "geology", "nercddc", "regional-geology"],
     "publisher": {
         "name": "Geological Society",
-        "mbox": "info@gs.org"
+        "email": "info@gs.org"
     },
-    "distribution": [{"accessURL": "http://www.bgs.ac.uk/gbase/geochemcd/home.html",
-                       "byteSize": null,
-                       "description": "Resource locator",
-                       "format": "text/html",
-                       "title": ""}]
+    "creator": {
+        "name": "John Doe",
+        "email": "johndoe@example.com"
+    },
+    "distribution": [{
+        "accessURL": "http://www.bgs.ac.uk/gbase/geochemcd/home.html",
+        "byteSize": null,
+        "description": "Resource locator",
+        "format": "text/html",
+        "title": ""
+    }]
 }
diff --git a/examples/ckan/full_ckan_dataset.json b/examples/ckan/full_ckan_dataset.json
@@ -1,6 +1,6 @@
 {
-    "author": null,
-    "author_email": null,
+    "author": "John Doe",
+    "author_email": "johndoe@example.com",
     "extras": [
         {
             "__extras": {