Skip to content

Commit

Permalink
add functionality to patch missing @ids on Person records. Uses Fuzz…
Browse files Browse the repository at this point in the history
…y Wuzzy to match differences in names
  • Loading branch information
dansand committed May 15, 2024
1 parent af13f58 commit a4f7720
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .github/scripts/generate_identifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ def encode(name, i):
return result_str


# Dan added the 'Moved Permanently' condition as a band-aid fix for repos that have been deleted.
# The functionality here may need improvement/rethinking.

def exists(model_id):
cmd = "curl https://api.github.com/repos/ModelAtlasofTheEarth/{0}".format(model_id)
output = json.loads(run_command_check_output(cmd))
Expand Down
57 changes: 57 additions & 0 deletions .github/scripts/ro_crate_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from config import MATE_DOI, NCI_RECORD, AUSCOPE_RECORD, MATE_THREDDS_BASE
import re
import glob
from collections.abc import MutableMapping
from fuzzywuzzy import fuzz, process


def recursively_filter_key(obj, entity_template):

Expand Down Expand Up @@ -749,3 +752,57 @@ def replace_keys_recursive(obj):
return set(replace_keys_recursive(list(obj)))
else:
return obj



def collect_person_ids(data, person_records):
    """
    Recursively collect the @id of every named Person record found in data.

    Each Person that carries an '@id' is stored in person_records under the
    key "<givenName> <familyName>". The first @id seen for a key wins; later
    records with the same key never overwrite it.

    Person records with neither a givenName nor a familyName are skipped.
    They would otherwise all collapse onto the single key "None None", which
    would then match any other nameless Person with a perfect score. A record
    with only one of the two names keeps the existing key format (the missing
    part is rendered as "None").

    Args:
        data: The input data structure (dict or list) to traverse. Values of
            any other type are ignored.
        person_records: A dictionary mapping name key -> @id. Updated in place.
    """
    if isinstance(data, list):
        for item in data:
            collect_person_ids(item, person_records)
    elif isinstance(data, MutableMapping):
        if data.get('@type') == 'Person' and '@id' in data:
            given_name = data.get('givenName')
            family_name = data.get('familyName')
            # Only index records that have at least one usable name part.
            if given_name or family_name:
                key = f"{given_name} {family_name}"
                # setdefault keeps the first @id recorded for this name.
                person_records.setdefault(key, data['@id'])
        # Person records may be nested anywhere, including inside a Person.
        for value in data.values():
            collect_person_ids(value, person_records)

def assign_missing_ids(data, person_records, threshold=80):
    """
    Recursively assign a missing @id to Person records using fuzzy matching.

    For every Person without an '@id', the key "<givenName> <familyName>" is
    compared against the keys of person_records with
    fuzz.token_sort_ratio. If the best match scores at least `threshold`, its
    @id is written onto the record in place.

    A Person with neither a givenName nor a familyName is left untouched:
    there is nothing to match on, and the literal key "None None" must not be
    allowed to pick up another person's identifier. No lookup is attempted
    when person_records is empty.

    Args:
        data: The input data structure (dict or list) to traverse. Values of
            any other type are ignored.
        person_records: A dictionary of collected Person name keys -> @id.
        threshold: The minimum similarity score for fuzzy matching
            (default is 80).
    """
    if isinstance(data, list):
        for item in data:
            assign_missing_ids(item, person_records, threshold)
    elif isinstance(data, MutableMapping):
        if data.get('@type') == 'Person' and '@id' not in data:
            given_name = data.get('givenName')
            family_name = data.get('familyName')
            # Skip the lookup when there are no candidates or no name at all.
            if person_records and (given_name or family_name):
                key = f"{given_name} {family_name}"
                best_match = process.extractOne(
                    key, person_records.keys(), scorer=fuzz.token_sort_ratio
                )
                # best_match is (matched_key, score), or None if nothing matched.
                if best_match and best_match[1] >= threshold:
                    data['@id'] = person_records[best_match[0]]
        # The @id (if any) was added before this loop, so the mapping is not
        # resized while its values are being iterated.
        for value in data.values():
            assign_missing_ids(value, person_records, threshold)

def assign_ids(metadata, threshold=80):
    """
    Patch Person records in metadata that are missing an @id, in place.

    Works in two passes over the same structure: first every Person that
    already has an @id is indexed by name, then every Person without one is
    fuzzy-matched against that index.

    Args:
        metadata: The input metadata structure to process.
        threshold: The minimum similarity score for fuzzy matching
            (default is 80).
    """
    known_ids = {}
    # Pass 1: build the name -> @id index from records that have an @id.
    collect_person_ids(metadata, known_ids)
    # Pass 2: fill in @id on records that lack one, using the index.
    assign_missing_ids(metadata, known_ids, threshold)
4 changes: 3 additions & 1 deletion .github/scripts/write_repo_contents.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from github import Github, Auth
from parse_issue import parse_issue
from crosswalks import dict_to_metadata, dict_to_yaml, dict_to_report, metadata_to_nci
from ro_crate_utils import replace_keys_recursive
from ro_crate_utils import replace_keys_recursive, assign_ids
from yaml_utils import format_yaml_string
from request_utils import download_license_text
from copy_files import copy_files
Expand Down Expand Up @@ -44,6 +44,8 @@
rocratestr_nested = dict_to_metadata(data, flat_compact_crate=False, timestamp= timestamp)
rocratedict = json.loads(rocratestr_nested)
default_context_list = copy.deepcopy(rocratedict['@context'])
# patch missing @ids on Person records
assign_ids(rocratedict['@graph'])

try:

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ pygithub==2.2.0
PyLD
ruamel.yaml<0.18.0
ruamel.yaml.string
fuzzywuzzy

0 comments on commit a4f7720

Please sign in to comment.