Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix test for first entry serialization #54

Merged
merged 5 commits into from
Apr 16, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ jobs:
pip install -U setuptools wheel
pip install -e .[tests,dev]

- name: Run linters
run: |
pre-commit run --all-files

- name: Run tests
run: pytest -vv --cov-report=xml --cov-report=term ./tests

Expand Down
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
default_language_version:
python: python3.10

exclude: "scripts|src/optimade_launch"

repos:
- repo: https://github.com/ambv/black
rev: 23.3.0
Expand Down
1 change: 1 addition & 0 deletions src/optimake/archive/cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import click

from .scan_records import scan_records


Expand Down
3 changes: 1 addition & 2 deletions src/optimake/archive/scan_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
DEFAULT_ARCHIVE_URL = "https://archive.materialscloud.org/"



def process_records(records: list, archive_url: str=DEFAULT_ARCHIVE_URL):
def process_records(records: list, archive_url: str = DEFAULT_ARCHIVE_URL):
"""
Scan the Materials Cloud Archive entries, read the file info
and check if there is a file called "optimade.y(ml|aml)".
Expand Down
4 changes: 3 additions & 1 deletion src/optimake/cli.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import argparse
from pathlib import Path

from optimake.convert import convert_archive


def main():
parser = argparse.ArgumentParser(
prog="optimake",
description="Use an `optimade.yaml` config to describe archived data and create a OPTIMADE JSONL file for ingestion as an OPTIMADE API."
description="Use an `optimade.yaml` config to describe archived data and create a OPTIMADE JSONL file for ingestion as an OPTIMADE API.",
)
parser.add_argument("archive_path", help="The path to the archive to ingest.")
parser.add_argument("--jsonl-path", help="The path to write the JSONL file to.")
Expand Down
62 changes: 52 additions & 10 deletions src/optimake/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,47 @@ def _parse_entries(
return parsed_entries, entry_ids


def _set_unique_entry_ids(entry_ids: list[str]) -> list[str]:
"""Attempt to make the simplest unique set of entry IDs possible,
following a series of deterministic rules.

Parameters:
entry_ids: A list of entry IDs derived from file paths.

Returns:
A list of unique entry IDs.

"""

new_ids: list[str] = list(entry_ids)
target_num_ids = len(entry_ids)
depth: int = 0
max_depth: int = 10 # somewhat arbitrary upper limit
# Loop through each filename and try to ablate directories until a unique set arises
while len(set(new_ids)) != target_num_ids and depth < max_depth:
for i, id in enumerate(entry_ids):
new_ids[i] = "/".join(id.split("/")[-1 - depth :])
depth += 1

# Now try to ablate any common file names, e.g,. subfolders of POSCARs (1/POSCAR, 2/POSCAR)
# Loop through each filename and try to ablate directories until a unique set arises
new_ids_sans_common_filenames = [
"/".join(new_id.split("/")[0:-2]) for new_id in new_ids
]
if len(set(new_ids_sans_common_filenames)) == target_num_ids:
new_ids = new_ids_sans_common_filenames

# Now try to ablate any file extensions
new_ids_sans_extensions = [id.split(".")[0] for id in new_ids]
if len(set(new_ids_sans_extensions)) == target_num_ids:
return new_ids_sans_extensions

if len(set(new_ids)) != target_num_ids:
return entry_ids

return new_ids


ml-evs marked this conversation as resolved.
Show resolved Hide resolved
def _parse_and_assign_properties(
optimade_entries: dict[str, EntryResource],
property_matches_by_file: dict[str | None, list[Path]],
Expand Down Expand Up @@ -318,19 +359,20 @@ def _parse_and_assign_properties(

# Look for precisely matching IDs, or 'filename' matches
for id in optimade_entries:

property_entry_id = id
if id not in parsed_properties:
# detect any other compatible IDs; either those matching immutable ID or those matching the filename rule
property_entry_id = optimade_entries[id]["attributes"].get("immutable_id", None)
if property_entry_id is None:
# try to find a matching ID based on the filename
property_entry_id = id.split("/")[-1].split(".")[0]
if property_entry_id not in parsed_properties:
raise RuntimeError(
f"Found {id!r} or {property_entry_id!r} in entries but not in properties {parsed_properties.keys()=}"
)

# Loop over all defined properties and assign them to the entry, setting to None if missing
# Also cast types if provided
for property in all_property_fields:
# Loop over all defined properties and assign them to the entry, setting to None if missing
# Also cast types if provided
value = parsed_properties[property_entry_id].get(property, None)
# Look up both IDs: the file path-based ID or the ergonomic one
# Different property sources can use different ID schemes internally
value = parsed_properties.get(property_entry_id, {}).get(
property, None
) or parsed_properties.get(id, {}).get(property, None)
if property not in property_def_dict:
warnings.warn(f"Missing property definition for {property=}")
continue
Expand Down
4 changes: 3 additions & 1 deletion src/optimake/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ def load_csv_file(
return df.to_dict(orient="index")


PROPERTY_PARSERS: dict[str, list[Callable[[Path], Any]]] = {
PROPERTY_PARSERS: dict[
str, list[Callable[[Path, list[PropertyDefinition] | None], Any]]
] = {
".csv": [load_csv_file],
}

Expand Down
38 changes: 26 additions & 12 deletions tests/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
import shutil
from pathlib import Path

import numpy as np
import pytest
from optimade.models import EntryInfoResource

from optimake.convert import convert_archive

EXAMPLE_ARCHIVES = (Path(__file__).parent.parent / "examples").glob("*")
Expand All @@ -25,7 +25,7 @@ def test_convert_example_archives(archive_path, tmp_path):

jsonl_path = convert_archive(tmp_path)
assert jsonl_path.exists()

jsonl_path_custom = convert_archive(tmp_path, jsonl_path=tmp_path / "test.jsonl")
assert jsonl_path_custom.exists()

Expand Down Expand Up @@ -60,16 +60,30 @@ def test_convert_example_archives(archive_path, tmp_path):
False
), "No structures found in archive but test first entry was provided"

# @ml-evs: species is the only key that can be written in any order, so here we
# just sort before comparing. This will be fixed in the next optimade-python-tools
if species := next_entry.get("attributes", {}).get("species"):
next_entry["attributes"]["species"] = sorted(
species, key=lambda x: x["name"]
)

for key in ("id", "type", "relationships"):
assert next_entry[key] == first_entry[key]

json.dumps(first_entry["attributes"]) == json.dumps(
next_entry["attributes"]
)
def check_arrays(reference, test, field):
ref_array = reference["attributes"].pop(field, None)
if ref_array:
np.testing.assert_array_almost_equal(
ref_array, test["attributes"].pop(field)
)

# check JSON serialization of attributes compared to reference data, handling species and numerical arrays separately
array_fields = ["cartesian_site_positions", "lattice_vectors"]
for field in array_fields:
check_arrays(first_entry, next_entry, field)
first_entry.pop(field, None)
next_entry.pop(field, None)

first_entry_species = first_entry["attributes"].pop("species", None)
next_entry_species = next_entry["attributes"].pop("species", None)
if first_entry_species:
assert json.dumps(
sorted(first_entry_species, key=lambda _: _["name"])
) == json.dumps(sorted(next_entry_species, key=lambda _: _["name"]))

assert json.dumps(
first_entry["attributes"], sort_keys=True, indent=2
) == json.dumps(next_entry["attributes"], sort_keys=True, indent=2)
1 change: 0 additions & 1 deletion tests/test_yaml.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from pathlib import Path

import pytest

from optimake.config import Config

EXAMPLE_YAMLS = (Path(__file__).parent.parent / "examples").glob("*/optimade.yaml")
Expand Down
Loading