From d964b802a0976a03d4faa37588d414b46c5009ec Mon Sep 17 00:00:00 2001 From: dansand Date: Wed, 22 May 2024 16:57:40 +1000 Subject: [PATCH] more flexibility around csv input and new line --- .github/scripts/parse_issue.py | 14 +++++++++----- .github/scripts/parse_utils.py | 24 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/.github/scripts/parse_issue.py b/.github/scripts/parse_issue.py index d4a8875..e542480 100644 --- a/.github/scripts/parse_issue.py +++ b/.github/scripts/parse_issue.py @@ -3,7 +3,7 @@ from collections import defaultdict from request_utils import get_record, check_uri from parse_metadata_utils import parse_publication, parse_software, parse_organization -from parse_utils import parse_name_or_orcid, parse_yes_no_choice, get_authors, get_funders, process_funding_data, parse_image_and_caption, validate_slug, extract_doi_parts, extract_orcid, remove_duplicates, parse_size +from parse_utils import parse_name_or_orcid, parse_yes_no_choice, get_authors, get_funders, process_funding_data, parse_image_and_caption, validate_slug, extract_doi_parts, extract_orcid, remove_duplicates, parse_size, identify_separator, separate_string from dateutil import parser from datetime import datetime @@ -182,7 +182,8 @@ def parse_issue(issue): error_log += "**Software framework authors**\n" + log # software & algorithm keywords - software_keywords = [x.strip() for x in data["-> software & algorithm keywords"].split(",")] + #software_keywords = [x.strip() for x in data["-> software & algorithm keywords"].split(",")] + software_keywords = separate_string(data["-> software & algorithm keywords"]) #if software_keywords[0] == "_No response_": if null_response_check(software_keywords[0]): @@ -303,7 +304,8 @@ def parse_issue(issue): data_dict["license"] = license_record # model category - model_category = [x.strip() for x in data["-> model category"].split(",")] + #model_category = [x.strip() for x in data["-> model category"].split(",")] 
def _non_blank_lines(input_string):
    """Return the lines of ``input_string``, stripped, with blank lines dropped."""
    # Stripping each line also removes the '\r' left behind by CRLF line endings.
    stripped_lines = (line.strip() for line in input_string.split('\n'))
    return [line for line in stripped_lines if line]


def identify_separator(input_string):
    """Guess whether a list typed as free text is comma- or newline-separated.

    The heuristic compares the number of commas with the number of line
    breaks between non-blank lines: strictly more commas means the text is
    treated as comma-separated, otherwise as one item per line.

    Args:
        input_string: Raw text containing zero or more list items.

    Returns:
        ``'csv'`` or ``'newline'``.
    """
    lines = _non_blank_lines(input_string)

    comma_count = sum(line.count(',') for line in lines)
    # Only breaks between non-blank lines separate items; counting blank
    # lines would let stray empty rows flip the result to 'newline'.
    newline_count = max(len(lines) - 1, 0)

    return 'csv' if comma_count > newline_count else 'newline'


def separate_string(input_string):
    """Split a comma- or newline-separated string into a list of clean items.

    The separator is chosen by ``identify_separator``. Every item is stripped
    of surrounding whitespace, and empty items (trailing commas, doubled
    commas, blank or whitespace-only lines) are dropped.

    Args:
        input_string: Raw text containing zero or more list items.

    Returns:
        A list of non-empty strings. The list is empty when the input holds
        no items, so callers should check it before indexing ``[0]``.
    """
    lines = _non_blank_lines(input_string)

    if identify_separator(input_string) == 'csv':
        # Comma-separated input may still span several lines; flatten it.
        fields = (field.strip() for line in lines for field in line.split(','))
        return [field for field in fields if field]

    # Newline mode: each non-blank line is one item, commas are kept as-is.
    return lines