materialscloud-org · ml-evs · Apr 16, 2024 · Mar 31, 2024 · Mar 31, 2024 · Mar 31, 2024
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -38,6 +38,10 @@ jobs:
         pip install -U setuptools wheel
         pip install -e .[tests,dev]
 
+    - name: Run linters
+      run: |
+        pre-commit run --all-files
+
     - name: Run tests
       run: pytest -vv --cov-report=xml --cov-report=term ./tests
 

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,6 +1,8 @@
 default_language_version:
   python: python3.10
 
+exclude: "scripts|src/optimade_launch"
+
 repos:
   - repo: https://github.com/ambv/black
     rev: 23.3.0

diff --git a/src/optimake/archive/cli.py b/src/optimake/archive/cli.py
@@ -1,4 +1,5 @@
 import click
+
 from .scan_records import scan_records
 
 

diff --git a/src/optimake/archive/scan_records.py b/src/optimake/archive/scan_records.py
@@ -6,8 +6,7 @@
 DEFAULT_ARCHIVE_URL = "https://archive.materialscloud.org/"
 
 
-
-def process_records(records: list, archive_url: str=DEFAULT_ARCHIVE_URL):
+def process_records(records: list, archive_url: str = DEFAULT_ARCHIVE_URL):
     """
     Scan the Materials Cloud Archive entries, read the file info
     and check if there is a file called "optimade.y(ml|aml)".

diff --git a/src/optimake/cli.py b/src/optimake/cli.py
@@ -1,11 +1,13 @@
 import argparse
 from pathlib import Path
+
 from optimake.convert import convert_archive
 
+
 def main():
     parser = argparse.ArgumentParser(
         prog="optimake",
-        description="Use an `optimade.yaml` config to describe archived data and create a OPTIMADE JSONL file for ingestion as an OPTIMADE API."
+        description="Use an `optimade.yaml` config to describe archived data and create a OPTIMADE JSONL file for ingestion as an OPTIMADE API.",
     )
     parser.add_argument("archive_path", help="The path to the archive to ingest.")
     parser.add_argument("--jsonl-path", help="The path to write the JSONL file to.")

diff --git a/src/optimake/convert.py b/src/optimake/convert.py
@@ -261,6 +261,47 @@ def _parse_entries(
     return parsed_entries, entry_ids
 
 
+def _set_unique_entry_ids(entry_ids: list[str]) -> list[str]:
+    """Attempt to make the simplest unique set of entry IDs possible,
+    following a series of deterministic rules.
+
+    A file name shared by all IDs (e.g., `1/POSCAR`, `2/POSCAR`) and then file
+    extensions are dropped, but only while the IDs stay unique and non-empty.
+
+    Parameters:
+        entry_ids: A list of entry IDs derived from file paths.
+
+    Returns:
+        A list of unique entry IDs, or `entry_ids` unchanged if it has duplicates.
+    """
+    new_ids: list[str] = list(entry_ids)
+    target_num_ids = len(entry_ids)
+    max_depth: int = 10  # somewhat arbitrary upper limit
+    # Try ever-longer path suffixes (file name first) until a unique set arises
+    # NOTE(review): a no-op unless the full IDs already collide -- confirm intent
+    for depth in range(max_depth):
+        if len(set(new_ids)) == target_num_ids:
+            break
+        new_ids = ["/".join(e.split("/")[-1 - depth :]) for e in entry_ids]
+
+    # Now try to ablate any common file names, e.g., subfolders of POSCARs
+    # (1/POSCAR, 2/POSCAR); every ID must keep a non-empty, unique parent path
+    split_ids = [new_id.rpartition("/") for new_id in new_ids]
+    parents = [parent for parent, _, _ in split_ids]
+    if target_num_ids > 1 and len({name for *_, name in split_ids}) == 1:
+        if all(parents) and len(set(parents)) == target_num_ids:
+            new_ids = parents
+
+    # Now try to ablate any file extensions, unless that would empty an ID
+    stems = [new_id.split(".")[0] for new_id in new_ids]
+    if all(stems) and len(set(stems)) == target_num_ids:
+        return stems
+
+    if len(set(new_ids)) != target_num_ids:
+        return entry_ids
+    return new_ids
+
+
 def _parse_and_assign_properties(
     optimade_entries: dict[str, EntryResource],
     property_matches_by_file: dict[str | None, list[Path]],
@@ -318,19 +359,20 @@ def _parse_and_assign_properties(
 
     # Look for precisely matching IDs, or 'filename' matches
     for id in optimade_entries:
-
-        property_entry_id = id
-        if id not in parsed_properties:
+        # detect any other compatible IDs; either those matching immutable ID or those matching the filename rule
+        property_entry_id = optimade_entries[id]["attributes"].get("immutable_id", None)
+        if property_entry_id is None:
+            # try to find a matching ID based on the filename
             property_entry_id = id.split("/")[-1].split(".")[0]
-            if property_entry_id not in parsed_properties:
-                raise RuntimeError(
-                    f"Found {id!r} or {property_entry_id!r} in entries but not in properties {parsed_properties.keys()=}"
-                )
 
+        # Loop over all defined properties and assign them to the entry, setting to None if missing
+        # Also cast types if provided
         for property in all_property_fields:
-            # Loop over all defined properties and assign them to the entry, setting to None if missing
-            # Also cast types if provided
-            value = parsed_properties[property_entry_id].get(property, None)
+            # Look up both IDs: the file path-based ID or the ergonomic one
+            # Different property sources can use different ID schemes internally
+            value = parsed_properties.get(property_entry_id, {}).get(
+                property, None
+            ) or parsed_properties.get(id, {}).get(property, None)
             if property not in property_def_dict:
                 warnings.warn(f"Missing property definition for {property=}")
                 continue

diff --git a/src/optimake/parsers.py b/src/optimake/parsers.py
@@ -61,7 +61,9 @@ def load_csv_file(
     return df.to_dict(orient="index")
 
 
-PROPERTY_PARSERS: dict[str, list[Callable[[Path], Any]]] = {
+PROPERTY_PARSERS: dict[
+    str, list[Callable[[Path, list[PropertyDefinition] | None], Any]]
+] = {
     ".csv": [load_csv_file],
 }
 

diff --git a/tests/test_convert.py b/tests/test_convert.py
@@ -2,9 +2,9 @@
 import shutil
 from pathlib import Path
 
+import numpy as np
 import pytest
 from optimade.models import EntryInfoResource
-
 from optimake.convert import convert_archive
 
 EXAMPLE_ARCHIVES = (Path(__file__).parent.parent / "examples").glob("*")
@@ -25,7 +25,7 @@ def test_convert_example_archives(archive_path, tmp_path):
 
     jsonl_path = convert_archive(tmp_path)
     assert jsonl_path.exists()
-    
+
     jsonl_path_custom = convert_archive(tmp_path, jsonl_path=tmp_path / "test.jsonl")
     assert jsonl_path_custom.exists()
 
@@ -60,16 +60,30 @@ def test_convert_example_archives(archive_path, tmp_path):
                     False
                 ), "No structures found in archive but test first entry was provided"
 
-            # @ml-evs: species is the only key that can be written in any order, so here we
-            # just sort before comparing. This will be fixed in the next optimade-python-tools
-            if species := next_entry.get("attributes", {}).get("species"):
-                next_entry["attributes"]["species"] = sorted(
-                    species, key=lambda x: x["name"]
-                )
-
             for key in ("id", "type", "relationships"):
                 assert next_entry[key] == first_entry[key]
 
-            json.dumps(first_entry["attributes"]) == json.dumps(
-                next_entry["attributes"]
-            )
+            def check_arrays(reference, test, field):
+                ref_array = reference["attributes"].pop(field, None)
+                if ref_array:
+                    np.testing.assert_array_almost_equal(
+                        ref_array, test["attributes"].pop(field)
+                    )
+
+            # check JSON serialization of attributes compared to reference data, handling species and numerical arrays separately
+            array_fields = ["cartesian_site_positions", "lattice_vectors"]
+            for field in array_fields:
+                check_arrays(first_entry, next_entry, field)
+                first_entry.pop(field, None)
+                next_entry.pop(field, None)
+
+            first_entry_species = first_entry["attributes"].pop("species", None)
+            next_entry_species = next_entry["attributes"].pop("species", None)
+            if first_entry_species:
+                assert json.dumps(
+                    sorted(first_entry_species, key=lambda _: _["name"])
+                ) == json.dumps(sorted(next_entry_species, key=lambda _: _["name"]))
+
+            assert json.dumps(
+                first_entry["attributes"], sort_keys=True, indent=2
+            ) == json.dumps(next_entry["attributes"], sort_keys=True, indent=2)
diff --git a/tests/test_yaml.py b/tests/test_yaml.py
@@ -1,7 +1,6 @@
 from pathlib import Path
 
 import pytest
-
 from optimake.config import Config
 
 EXAMPLE_YAMLS = (Path(__file__).parent.parent / "examples").glob("*/optimade.yaml")