Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/ted 717 #282

Merged
merged 11 commits into from
Sep 26, 2022
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def open_local(paths, mode="r", encoding="utf8"):
"validation_summary_runner = ted_sws.notice_validator.entrypoints.cli.cmd_validation_summary_runner:main",
"triple_store_loader = ted_sws.mapping_suite_processor.entrypoints.cli.cmd_triple_store_loader:main",
"mapping_suite_validator = ted_sws.mapping_suite_processor.entrypoints.cli.cmd_mapping_suite_validator:main",
"metadata_generator = ted_sws.mapping_suite_processor.entrypoints.cli.cmd_mapping_suite_validator:main",
"metadata_generator = ted_sws.mapping_suite_processor.entrypoints.cli.cmd_metadata_generator:main",

"mapping_suite_processor = ted_sws.mapping_suite_processor.entrypoints.cli.cmd_mapping_suite_processor:main",
"yarrrml2rml_converter = ted_sws.mapping_suite_processor.entrypoints.cli.cmd_yarrrml2rml_converter:main",
Expand Down
6 changes: 3 additions & 3 deletions ted_sws/core/model/manifestation.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class SPARQLQueryRefinedResultType(Enum):
The aggregated SPARQL Query result
"""
VALID = "valid"
UNVERIFIABLE = "unverifiable"
INVALID = "invalid"
ERROR = "error"
WARNING = "warning"
Expand Down Expand Up @@ -70,9 +71,7 @@ class XPATHCoverageValidationAssertion(PropertyBaseModel):
"""

"""
standard_form_field_id: Optional[str]
eform_bt_id: Optional[str]
title: Optional[str]
form_field: Optional[str]
xpath: Optional[str]
count: Optional[int]
notice_hit: Optional[Dict[str, int]]
Expand Down Expand Up @@ -222,6 +221,7 @@ class XMLManifestationValidationSummaryReport(PropertyBaseModel):

class SPARQLSummaryCountReport(PropertyBaseModel):
valid: Optional[int] = 0
unverifiable: Optional[int] = 0
invalid: Optional[int] = 0
warning: Optional[int] = 0
error: Optional[int] = 0
Expand Down
4 changes: 1 addition & 3 deletions ted_sws/core/model/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,7 @@ class TransformationTestData(MappingSuiteComponent):

class ConceptualMappingXPATH(MappingSuiteComponent):
xpath: str
name: Optional[str]
standard_form_field_id: Optional[str]
eform_bt_id: Optional[str]
form_field: Optional[str]


class ConceptualMappingMetadata(MappingSuiteComponent):
Expand Down
11 changes: 6 additions & 5 deletions ted_sws/data_sampler/services/notice_xml_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,12 @@ def _notice_namespaces(xml_file) -> dict:

def _ns_tag(ns_tag):
tag = ns_tag[1]
ns = ns_tag[0]
if ns:
ns_alias = namespaces[ns]
if ns_alias:
return ns_alias + ":" + tag
# Use just the tag, ignoring the namespace
# ns = ns_tag[0]
# if ns:
# ns_alias = namespaces[ns]
# if ns_alias:
# return ns_alias + ":" + tag
return tag

def _xpath_generator(xml_file):
Expand Down
3 changes: 3 additions & 0 deletions ted_sws/event_manager/services/logger_from_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,11 @@ def get_env_logger(logger: EventLogger, is_cli: bool = False) -> EventLogger:


global_logger: EventLogger = get_env_logger(EventLogger(DAGLoggerConfig()))
global_loggers: Dict[str, EventLogger] = {}
global_cli_logger: EventLogger = get_env_logger(EventLogger(CLILoggerConfig()), is_cli=True)
global_cli_loggers: Dict[str, EventLogger] = {}
global_console_logger: EventLogger = get_env_logger(EventLogger(ConsoleLoggerConfig()), is_cli=True)
global_console_loggers: Dict[str, EventLogger] = {}


def get_logger(name: str = None) -> EventLogger:
Expand Down
1 change: 1 addition & 0 deletions ted_sws/mapping_suite_processor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
RULES_E_FORM_BT_NAME = 'eForm BT Name (Provisional/Indicative) (O)'
RULES_FIELD_XPATH = 'Field XPath (M)'
RULES_SF_FIELD_ID = 'Standard Form Field ID (M)'
RULES_SF_FIELD_NAME = 'Standard Form Field Name (M)'
RULES_E_FORM_BT_ID = 'eForm BT-ID (Provisional/Indicative) (O)'
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,19 @@

from ted_sws.event_manager.services.log import log_cli_brief_error
from ted_sws.mapping_suite_processor import CONCEPTUAL_MAPPINGS_METADATA_SHEET_NAME, \
CONCEPTUAL_MAPPINGS_RULES_SHEET_NAME, RULES_FIELD_XPATH, RULES_E_FORM_BT_NAME, RULES_SF_FIELD_ID, RULES_E_FORM_BT_ID
CONCEPTUAL_MAPPINGS_RULES_SHEET_NAME, RULES_FIELD_XPATH, RULES_E_FORM_BT_NAME, RULES_SF_FIELD_ID, \
RULES_E_FORM_BT_ID, RULES_SF_FIELD_NAME
from ted_sws.notice_validator import BASE_XPATH_FIELD
from ted_sws.resources.prefixes import PREFIXES_DEFINITIONS

RULES_SF_FIELD_NAME = 'Standard Form Field Name (M)'
RULES_CLASS_PATH = 'Class path (M)'
RULES_PROPERTY_PATH = 'Property path (M)'

CL_FIELD_VALUE = 'Field Value (in XML)'
CL_MAPPING_REFERENCE = 'Mapping Reference (in ePO)'
CL_SUPER_TYPE = 'SuperType'
CL_XPATH_FRAGMENT = 'XML PATH Fragment'

DEFAULT_RQ_NAME = 'sparql_query_'

SPARQL_PREFIX_PATTERN = re.compile('(?:\\s+|^)(\\w+)?:')
Expand All @@ -33,13 +38,56 @@ def concat_field_xpath(base_xpath: str, field_xpath: str, separator: str = ", ")
return separator.join([base_xpath + xpath for xpath in field_xpath.splitlines()])


def sparql_validation_generator(data: pd.DataFrame, base_xpath: str,
prefixes_definitions) -> Iterator[str]:
def _get_elem_reference(class_value: str, cl_dfs: dict, field_xpath: list) -> str:
    """
    Resolve the reference to use for one element of a class path.

    A value that does not contain the "(from " marker is returned unchanged.
    Otherwise the controlled-list sheet whose name starts with the marked id is
    searched for a row whose SuperType equals the class name and whose XML PATH
    fragment equals one of the given field XPath fragments; the Mapping Reference
    of the first such row is returned.

    NOTE(review): the nesting of this function was reconstructed from a listing
    without indentation; the `else: return class_value` branch is assumed to pair
    with the outermost `if` -- confirm against the repository.

    :param class_value: a class path element, optionally followed by "(from <id>)"
    :param cl_dfs: controlled-list sheets (DataFrames) keyed by sheet name
    :param field_xpath: the fragments of the field XPath
    :return: the resolved reference, or '' when no sheet or no row matches
    """
    if '(from ' not in class_value:
        return class_value

    # Find CL sheet: the id is the last whitespace-separated token minus its
    # final character (presumably the closing ")" of the marker).
    cl_id = class_value.split()[-1][:-1]
    cl_sheet: pd.DataFrame = pd.DataFrame()
    # Every sheet name is visited, so the last sheet with a matching prefix wins.
    for sheet_name in cl_dfs:
        if sheet_name.startswith(cl_id):
            cl_sheet = cl_dfs[sheet_name]

    if cl_sheet.empty:
        return ''

    # Find elem type
    class_name = class_value.split()[0]
    for _, row in cl_sheet.iterrows():
        # The class comparison does not depend on the fragment, so test it once per row.
        if class_name != row[CL_SUPER_TYPE]:
            continue
        xpath_fragment = row[CL_XPATH_FRAGMENT]
        for field_xpath_fragment in reversed(field_xpath):
            if field_xpath_fragment == xpath_fragment:
                return row[CL_MAPPING_REFERENCE]

    return ''


def _generate_subject_type(class_path: str, cl_dfs: dict, field_xpath: str) -> str:
    """
    Build the rdf:type triple pattern for the ?this variable.

    The first element of the ' / '-separated class path is resolved through
    _get_elem_reference, together with the '/'-separated fragments of the field
    XPath (an empty string is passed when the XPath is missing).

    :param class_path: the class path, elements separated by ' / '
    :param cl_dfs: controlled-list sheets keyed by sheet name
    :param field_xpath: the field XPath, possibly a missing (NaN) value
    :return: "?this rdf:type <reference> ." or '' when no reference is resolved
    """
    first_class = class_path.split(' / ')[0]

    if pd.isna(field_xpath):
        xpath_fragments = ''
    else:
        xpath_fragments = field_xpath.split('/')

    subject_reference = _get_elem_reference(first_class, cl_dfs, xpath_fragments)
    if not subject_reference:
        return ''
    return f"?this rdf:type {subject_reference} ."


def _generate_object_type(class_path: str, cl_dfs: dict, field_xpath: str) -> str:
    """
    Build the rdf:type triple pattern for the ?value variable.

    The last element of the ' / '-separated class path is resolved through
    _get_elem_reference, together with the '/'-separated fragments of the field
    XPath (an empty string is passed when the XPath is missing).

    :param class_path: the class path, elements separated by ' / '
    :param cl_dfs: controlled-list sheets keyed by sheet name
    :param field_xpath: the field XPath, possibly a missing (NaN) value
    :return: "?value rdf:type <reference> ." or '' when nothing is generated
    """
    last_class = class_path.split(' / ')[-1]

    # Temporary solution (as flagged by the original author): elements that
    # contain 'at-voc:' get no type pattern at all.
    if 'at-voc:' in last_class:
        return ''

    if pd.isna(field_xpath):
        xpath_fragments = ''
    else:
        xpath_fragments = field_xpath.split('/')

    object_reference = _get_elem_reference(last_class, cl_dfs, xpath_fragments)
    if not object_reference:
        return ''
    return f"?value rdf:type {object_reference} ."


def sparql_validation_generator(data: pd.DataFrame, base_xpath: str, controlled_list_dfs: dict,
prefixes_definitions: dict) -> Iterator[str]:
"""
This function generates SPARQL queries based on data in the dataframe.
:param prefixes_definitions:
:param data:
:param base_xpath:
:param controlled_list_dfs:
:return:
"""

Expand All @@ -52,10 +100,22 @@ def sparql_validation_generator(data: pd.DataFrame, base_xpath: str,
class_path = row[RULES_CLASS_PATH]
property_path = row[RULES_PROPERTY_PATH]

sparql_title = f"{sf_field_id} - {sf_field_name}"
subject_type = _generate_subject_type(class_path, controlled_list_dfs, field_xpath) \
if '?this' in property_path else ''
object_type = _generate_object_type(class_path, controlled_list_dfs, field_xpath) \
if '?value' in property_path else ''

prefixes_string = property_path
if subject_type:
prefixes_string += subject_type
if object_type:
prefixes_string += object_type

sparql_title_parts = [sf_field_id, sf_field_name]
sparql_title = " - ".join([item for item in sparql_title_parts if not pd.isnull(item)])

prefixes = []
for prefix in get_sparql_prefixes(property_path):
for prefix in get_sparql_prefixes(prefixes_string):
if prefix in prefixes_definitions:
prefix_value = prefixes_definitions.get(prefix)
else:
Expand All @@ -72,7 +132,16 @@ def sparql_validation_generator(data: pd.DataFrame, base_xpath: str,
f"The expected ontology instances are epo: {class_path} .\n" \
f"#xpath: {concat_field_xpath(base_xpath, field_xpath, separator=',')}" \
"\n" + "\n" + "\n".join(prefixes) + "\n\n" \
f"ASK WHERE {{ {property_path} }}"
f"ASK WHERE {{ \n\t\t" \
f"{subject_type}\n\t\t" \
f"{object_type}\n\t\t" \
f"{property_path} }}"


def _process_concept_mapping_sheet(sheet: pd.DataFrame) -> pd.DataFrame:
    """
    Promote the first row of a sheet to its column header.

    The values of the first row become the column labels of the result and that
    row is dropped from the data. The result is built on a copy, so the DataFrame
    passed in keeps its own column labels and rows.

    :param sheet: a sheet whose first row holds the column names
    :return: a new DataFrame with the remaining rows, labelled by the first row
    """
    header = sheet.iloc[0]
    # Copy before relabelling so the caller's frame is not modified and the
    # result is not a view on it.
    processed = sheet.iloc[1:].copy()
    processed.columns = header
    return processed


def mapping_suite_processor_generate_sparql_queries(conceptual_mappings_file_path: pathlib.Path,
Expand All @@ -91,17 +160,26 @@ def mapping_suite_processor_generate_sparql_queries(conceptual_mappings_file_pat
prefixes_definitions = PREFIXES_DEFINITIONS

with open(conceptual_mappings_file_path, 'rb') as excel_file:
conceptual_mappings_rules_df = pd.read_excel(excel_file, sheet_name=CONCEPTUAL_MAPPINGS_RULES_SHEET_NAME)
conceptual_mappings_rules_df.columns = conceptual_mappings_rules_df.iloc[0]
conceptual_mappings_rules_df = conceptual_mappings_rules_df[1:]
conceptual_mappings_df = pd.read_excel(excel_file, sheet_name=None)
controlled_list_dfs = {}
for sheet_name in conceptual_mappings_df:
if sheet_name.startswith('CL'):
controlled_list_dfs[sheet_name] = _process_concept_mapping_sheet(conceptual_mappings_df[sheet_name])
conceptual_mappings_rules_df = _process_concept_mapping_sheet(
conceptual_mappings_df[CONCEPTUAL_MAPPINGS_RULES_SHEET_NAME])
conceptual_mappings_rules_df[RULES_SF_FIELD_ID].ffill(axis="index", inplace=True)
conceptual_mappings_rules_df[RULES_SF_FIELD_NAME].ffill(axis="index", inplace=True)
conceptual_mappings_rules_df = conceptual_mappings_rules_df[
conceptual_mappings_rules_df[RULES_PROPERTY_PATH].notnull()]
metadata_df = pd.read_excel(excel_file, sheet_name=CONCEPTUAL_MAPPINGS_METADATA_SHEET_NAME)
metadata_df = conceptual_mappings_df[CONCEPTUAL_MAPPINGS_METADATA_SHEET_NAME]
metadata = metadata_df.set_index('Field').T.to_dict('list')
base_xpath = metadata[BASE_XPATH_FIELD][0]
sparql_queries = sparql_validation_generator(conceptual_mappings_rules_df, base_xpath, prefixes_definitions)

sparql_queries = sparql_validation_generator(conceptual_mappings_rules_df, base_xpath, controlled_list_dfs,
prefixes_definitions)

output_sparql_queries_folder_path.mkdir(parents=True, exist_ok=True)
for index, sparql_query in enumerate(sparql_queries):
output_file_path = output_sparql_queries_folder_path / f"{rq_name}{index}.rq"
with open(output_file_path, "w") as output_file:
with open(output_file_path, "w", encoding="utf-8") as output_file:
output_file.write(sparql_query)
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from ted_sws.core.model.transform import ConceptualMapping, ConceptualMappingXPATH, ConceptualMappingMetadata
from ted_sws.mapping_suite_processor import CONCEPTUAL_MAPPINGS_METADATA_SHEET_NAME, \
CONCEPTUAL_MAPPINGS_RULES_SHEET_NAME, RULES_FIELD_XPATH, RULES_E_FORM_BT_NAME, RULES_SF_FIELD_ID, RULES_E_FORM_BT_ID
CONCEPTUAL_MAPPINGS_RULES_SHEET_NAME, RULES_FIELD_XPATH, RULES_SF_FIELD_ID, RULES_SF_FIELD_NAME
from ted_sws.notice_validator import BASE_XPATH_FIELD

CONCEPTUAL_MAPPINGS_FILE_NAME = "conceptual_mappings.xlsx"
Expand Down Expand Up @@ -44,9 +44,10 @@ def mapping_suite_read_conceptual_mapping(conceptual_mappings_file_path: pathlib
with open(conceptual_mappings_file_path, 'rb') as excel_file:
base_xpath = metadata[BASE_XPATH_FIELD][0]
rules_df = pd.read_excel(excel_file, sheet_name=CONCEPTUAL_MAPPINGS_RULES_SHEET_NAME, header=1)
rules_df[RULES_SF_FIELD_ID].ffill(axis="index", inplace=True)
rules_df[RULES_SF_FIELD_NAME].ffill(axis="index", inplace=True)
df_xpaths = rules_df[RULES_FIELD_XPATH].tolist()
df_bt_names = rules_df[RULES_E_FORM_BT_NAME].tolist()
df_eform_bt_ids = rules_df[RULES_E_FORM_BT_ID].tolist()
df_sform_field_names = rules_df[RULES_SF_FIELD_NAME].tolist()
df_sform_field_ids = rules_df[RULES_SF_FIELD_ID].tolist()
processed_xpaths = set()
for idx, xpath_row in enumerate(df_xpaths):
Expand All @@ -56,15 +57,10 @@ def mapping_suite_read_conceptual_mapping(conceptual_mappings_file_path: pathlib
if xpath:
xpath = base_xpath + "/" + xpath
if xpath not in processed_xpaths:
xpath_name = df_bt_names[idx] if df_bt_names[idx] is not np.nan else None
eform_bt_id = df_eform_bt_ids[idx] if df_eform_bt_ids[idx] is not np.nan else None
sform_field_id = df_sform_field_ids[idx] if df_sform_field_ids[idx] is not np.nan else None

form_fields = [df_sform_field_ids[idx], df_sform_field_names[idx]]
cm_xpath: ConceptualMappingXPATH = ConceptualMappingXPATH(
xpath=xpath,
name=xpath_name,
standard_form_field_id=sform_field_id,
eform_bt_id=eform_bt_id
form_field=" - ".join([item for item in form_fields if not pd.isnull(item)])
)
conceptual_mapping_xpaths.append(cm_xpath)
processed_xpaths.add(xpath)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ def notice_sparql_summary(self, notice: Notice, report: RDFManifestationValidati
if validation.result == SPARQLQueryRefinedResultType.VALID.value:
report_count.valid += 1
result_count.valid += 1
if validation.result == SPARQLQueryRefinedResultType.UNVERIFIABLE.value:
report_count.unverifiable += 1
result_count.unverifiable += 1
elif validation.result == SPARQLQueryRefinedResultType.INVALID.value:
report_count.invalid += 1
result_count.invalid += 1
Expand Down
6 changes: 2 additions & 4 deletions ted_sws/notice_validator/adapters/xpath_coverage_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,8 @@ def xpath_assertions(self, notice_xpaths: XPATH_TYPE,
for xpath in self.conceptual_xpaths:
xpath_assertion = XPATHCoverageValidationAssertion()
xpath_data = self.conceptual_xpath_data[xpath]
title = xpath_data.name
xpath_assertion.title = title if title is not np.nan else ''
xpath_assertion.standard_form_field_id = xpath_data.standard_form_field_id
xpath_assertion.eform_bt_id = xpath_data.eform_bt_id
form_field = xpath_data.form_field
xpath_assertion.form_field = form_field if form_field is not np.nan else ''
xpath_assertion.xpath = xpath
xpath_assertion.count = xpaths_list.count(xpath)
xpath_assertion.notice_hit = self.find_notice_by_xpath(notice_xpaths, xpath)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
table thead th {
text-align: left;
}
.success, .valid {
.success, .valid, .unverifiable {
color: #3c763d;
}
.info {
Expand Down Expand Up @@ -67,7 +67,7 @@
<table class="display">
<thead class="center aligned">
<tr>
<th>Title</th>
<th>Form Field</th>
<th>Description</th>
<th>Query content</th>
<th>Result</th>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,7 @@
<table class="display" data-order='[[3, "asc"]]'>
<thead>
<tr>
<th>Standard Form Field ID</th>
<th>eForm BT-ID</th>
<th>Title</th>
<th>Form Field</th>
<th>XPATH</th>
<th>Found</th>
<th>Notice count</th>
Expand All @@ -115,9 +113,7 @@
<tbody>
{% for item in validation_result.xpath_assertions %}
<tr>
<td>{{ item.standard_form_field_id if item.standard_form_field_id is not none else '' }}</td>
<td>{{ item.eform_bt_id if item.eform_bt_id is not none else '' }}</td>
<td>{{ item.title if item.title is not none else '' }}</td>
<td>{{ item.form_field if item.form_field is not none else '' }}</td>
<td>{{ item.xpath }}</td>
<td class="{% if item.query_result %}success{% else %}error{% endif %}">{{ item.query_result }}</td>
<td>{{ item.count }}</td>
Expand Down
5 changes: 3 additions & 2 deletions ted_sws/notice_validator/services/sparql_test_suite_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,10 @@ def _process_sparql_ask_result(self, query_result, sparql_query: SPARQLQuery,
if ask_answer and sparql_query_result.fields_covered:
result = SPARQLQueryRefinedResultType.VALID
elif not ask_answer and not sparql_query_result.fields_covered:
result = SPARQLQueryRefinedResultType.UNVERIFIABLE
elif ask_answer and not sparql_query_result.fields_covered:
result = SPARQLQueryRefinedResultType.WARNING
elif (not ask_answer and sparql_query_result.fields_covered) or (
ask_answer and not sparql_query_result.fields_covered):
elif not ask_answer and sparql_query_result.fields_covered:
result = SPARQLQueryRefinedResultType.INVALID

sparql_query_result.result = result
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
def test_mapping_suite_processor_generate_sparql_queries(caplog, fake_mapping_suite_id, file_system_repository_path):
with tempfile.TemporaryDirectory() as temp_folder:
temp_mapping_suite_path = Path(temp_folder)
shutil.copytree(file_system_repository_path, temp_mapping_suite_path,
dirs_exist_ok=True)
shutil.copytree(file_system_repository_path, temp_mapping_suite_path, dirs_exist_ok=True)

conceptual_mappings_file_path = Path(CONCEPTUAL_MAPPINGS_FILE_TEMPLATE.format(
mappings_path=temp_mapping_suite_path,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
from tests import temporary_copy
import os
import json

from ted_sws.mapping_suite_processor.services.conceptual_mapping_generate_sparql_queries import \
mapping_suite_processor_generate_sparql_queries
import pathlib

def test_mapping_suite_processor_upload_in_mongodb(file_system_repository_path, mongodb_client):
with temporary_copy(file_system_repository_path) as tmp_mapping_suite_package_path:
Expand Down
Loading