diff --git a/.github/scripts/generate_identifier.py b/.github/scripts/generate_identifier.py
index f2a9032..e391b3f 100644
--- a/.github/scripts/generate_identifier.py
+++ b/.github/scripts/generate_identifier.py
@@ -14,6 +14,9 @@ def encode(name, i):
     return result_str
 
 
+# NOTE: Dan added the 'Moved Permanently' condition as a band-aid fix for repos
+# that have been deleted; the functionality here may need improvement/rethinking.
+
 def exists(model_id):
     cmd = "curl https://api.github.com/repos/ModelAtlasofTheEarth/{0}".format(model_id)
     output = json.loads(run_command_check_output(cmd))
diff --git a/.github/scripts/ro_crate_utils.py b/.github/scripts/ro_crate_utils.py
index e5ca164..74d85ba 100644
--- a/.github/scripts/ro_crate_utils.py
+++ b/.github/scripts/ro_crate_utils.py
@@ -5,6 +5,9 @@ from config import MATE_DOI, NCI_RECORD, AUSCOPE_RECORD, MATE_THREDDS_BASE
 import re
 import glob
 
+from collections.abc import MutableMapping
+from fuzzywuzzy import fuzz, process
+
 
 
 def recursively_filter_key(obj, entity_template):
@@ -749,3 +752,82 @@ def replace_keys_recursive(obj):
         return set(replace_keys_recursive(list(obj)))
     else:
         return obj
+
+
+def _person_name_key(record):
+    """
+    Builds the "givenName familyName" lookup key for a Person record.
+
+    Args:
+        record: A mapping describing a Person.
+
+    Returns:
+        The stripped name parts that are present, joined by a space, or None
+        when the record has neither name, so that unnamed people are never
+        indexed or fuzzy-matched under the literal string "None None".
+    """
+    parts = (str(record.get(field) or '').strip() for field in ('givenName', 'familyName'))
+    key = ' '.join(part for part in parts if part)
+    return key or None
+
+
+def collect_person_ids(data, person_records):
+    """
+    Recursively collects Person records with @id and stores them in person_records.
+
+    Args:
+        data: The input data structure (dict or list) to traverse.
+        person_records: A dictionary, updated in place, that maps each name key
+            to the first @id found for it.
+    """
+    if isinstance(data, list):
+        for item in data:
+            collect_person_ids(item, person_records)
+    elif isinstance(data, MutableMapping):
+        if data.get('@type') == 'Person' and '@id' in data:
+            key = _person_name_key(data)
+            # Skip unnamed people; the first @id seen for a name wins.
+            if key is not None:
+                person_records.setdefault(key, data['@id'])
+        for value in data.values():
+            collect_person_ids(value, person_records)
+
+
+def assign_missing_ids(data, person_records, threshold=80):
+    """
+    Recursively assigns missing @id to Person records using fuzzy matching.
+
+    Args:
+        data: The input data structure (dict or list) to traverse; modified in place.
+        person_records: A dictionary of collected Person records with their @id.
+        threshold: The minimum similarity score for fuzzy matching (default is 80).
+    """
+    if isinstance(data, list):
+        for item in data:
+            assign_missing_ids(item, person_records, threshold)
+    elif isinstance(data, MutableMapping):
+        if data.get('@type') == 'Person' and '@id' not in data:
+            key = _person_name_key(data)
+            # Nothing to match when the person is unnamed or no ids were collected.
+            if key is not None and person_records:
+                best_match = process.extractOne(
+                    key, list(person_records), scorer=fuzz.token_sort_ratio
+                )
+                # best_match is (matched_name, score), or None when nothing matched.
+                if best_match and best_match[1] >= threshold:
+                    data['@id'] = person_records[best_match[0]]
+        for value in data.values():
+            assign_missing_ids(value, person_records, threshold)
+
+
+def assign_ids(metadata, threshold=80):
+    """
+    Collects Person records with @id and assigns missing @id to Person records in metadata.
+
+    Args:
+        metadata: The input metadata (dict or list) to process; modified in place.
+        threshold: The minimum similarity score for fuzzy matching (default is 80).
+    """
+    person_records = {}
+    collect_person_ids(metadata, person_records)
+    assign_missing_ids(metadata, person_records, threshold)
diff --git a/.github/scripts/write_repo_contents.py b/.github/scripts/write_repo_contents.py
index cfa1452..6e97c48 100644
--- a/.github/scripts/write_repo_contents.py
+++ b/.github/scripts/write_repo_contents.py
@@ -3,7 +3,7 @@ from github import Github, Auth
 
 from parse_issue import parse_issue
 from crosswalks import dict_to_metadata, dict_to_yaml, dict_to_report, metadata_to_nci
-from ro_crate_utils import replace_keys_recursive
+from ro_crate_utils import replace_keys_recursive, assign_ids
 from yaml_utils import format_yaml_string
 from request_utils import download_license_text
 from copy_files import copy_files
@@ -44,6 +44,8 @@ rocratestr_nested = dict_to_metadata(data, flat_compact_crate=False, timestamp= timestamp)
 rocratedict = json.loads(rocratestr_nested)
 default_context_list = copy.deepcopy(rocratedict['@context'])
 
+# Patch missing ids on Person records
+assign_ids(rocratedict['@graph'])
 
 
 try:
diff --git a/requirements.txt b/requirements.txt
index e8d4d88..5862ef3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ pygithub==2.2.0
 PyLD
 ruamel.yaml<0.18.0
 ruamel.yaml.string
+fuzzywuzzy