From 69e6c59952cd39bd94183f40b357730fb9295a5d Mon Sep 17 00:00:00 2001 From: CaptainOfHacks Date: Wed, 18 Jan 2023 16:50:14 +0200 Subject: [PATCH] add Cellar request dellay --- .../data_manager/adapters/sparql_endpoint.py | 11 ++++++---- .../resources/__init__.py | 8 +++---- .../notice_validator/resources/__init__.py | 12 +++++----- .../check_notice_availability.rq | 4 ++++ .../check_notices_availability.rq | 5 +++++ .../check_availability_of_notice_in_cellar.py | 22 +++++++++++++------ ..._check_availability_of_notice_in_cellar.py | 10 ++++++++- 7 files changed, 49 insertions(+), 23 deletions(-) create mode 100644 ted_sws/notice_validator/resources/sparql_query_templates/check_notice_availability.rq create mode 100644 ted_sws/notice_validator/resources/sparql_query_templates/check_notices_availability.rq diff --git a/ted_sws/data_manager/adapters/sparql_endpoint.py b/ted_sws/data_manager/adapters/sparql_endpoint.py index 263942d20..296dd982a 100644 --- a/ted_sws/data_manager/adapters/sparql_endpoint.py +++ b/ted_sws/data_manager/adapters/sparql_endpoint.py @@ -14,7 +14,7 @@ import pandas as pd import rdflib -from SPARQLWrapper import SPARQLWrapper, CSV, JSON, RDF +from SPARQLWrapper import SPARQLWrapper, CSV, JSON, RDF, POST from ted_sws import config @@ -35,13 +35,15 @@ class SPARQLClientPool(object): connection_pool = {} @staticmethod - def create_or_reuse_connection(endpoint_url: str, user: str, password: str): + def create_or_reuse_connection(endpoint_url: str, user: str, password: str, use_post_method: bool = False): if endpoint_url not in SPARQLClientPool.connection_pool: sparql_wrapper = SPARQLWrapper(endpoint_url) sparql_wrapper.setCredentials( user=user, passwd=password ) + if use_post_method: + sparql_wrapper.setMethod(method=POST) SPARQLClientPool.connection_pool[endpoint_url] = sparql_wrapper return SPARQLClientPool.connection_pool[endpoint_url] @@ -123,10 +125,11 @@ def add_data_to_repository(self, file_content, repository_name, mime_type): class SPARQLTripleStoreEndpoint(TripleStoreEndpointABC): - def __init__(self, endpoint_url: str, user: str = None, password: str = None): + def __init__(self, endpoint_url: str, user: str = None, password: str = None, use_post_method: bool = False): user = user if user else config.AGRAPH_SUPER_USER password = password if password else config.AGRAPH_SUPER_PASSWORD - self.endpoint = SPARQLClientPool.create_or_reuse_connection(endpoint_url, user, password) + self.endpoint = SPARQLClientPool.create_or_reuse_connection(endpoint_url, user, password, + use_post_method=use_post_method) def _set_sparql_query(self, sparql_query: str): """ diff --git a/ted_sws/master_data_registry/resources/__init__.py b/ted_sws/master_data_registry/resources/__init__.py index d1b170eef..e45ecf34c 100644 --- a/ted_sws/master_data_registry/resources/__init__.py +++ b/ted_sws/master_data_registry/resources/__init__.py @@ -1,7 +1,7 @@ import pathlib MASTER_DATA_REGISTRY_RESOURCES_PATH = pathlib.Path(__file__).parent.resolve() - -TRIPLES_BY_CET_URI_SPARQL_QUERY_TEMPLATE_PATH = MASTER_DATA_REGISTRY_RESOURCES_PATH / "sparql_query_templates/get_by_cet_uri.rq" -PROCEDURE_SUBJECTS_SPARQL_QUERY_TEMPLATE_PATH = MASTER_DATA_REGISTRY_RESOURCES_PATH / "sparql_query_templates/get_procedure_uris.rq" -RDF_FRAGMENT_BY_URI_SPARQL_QUERY_TEMPLATE_PATH = MASTER_DATA_REGISTRY_RESOURCES_PATH / "sparql_query_templates/get_2_dependency_levels_for_a_uri_as_root.rq" +SPARQL_QUERY_TEMPLATES_PATH = MASTER_DATA_REGISTRY_RESOURCES_PATH / "sparql_query_templates" +TRIPLES_BY_CET_URI_SPARQL_QUERY_TEMPLATE_PATH = SPARQL_QUERY_TEMPLATES_PATH / "get_by_cet_uri.rq" +PROCEDURE_SUBJECTS_SPARQL_QUERY_TEMPLATE_PATH = SPARQL_QUERY_TEMPLATES_PATH / "get_procedure_uris.rq" +RDF_FRAGMENT_BY_URI_SPARQL_QUERY_TEMPLATE_PATH = SPARQL_QUERY_TEMPLATES_PATH / "get_2_dependency_levels_for_a_uri_as_root.rq" diff --git a/ted_sws/notice_validator/resources/__init__.py b/ted_sws/notice_validator/resources/__init__.py index 465232acc..c9aea559b 100644 --- a/ted_sws/notice_validator/resources/__init__.py +++ b/ted_sws/notice_validator/resources/__init__.py @@ -1,8 +1,6 @@ -#!/usr/bin/python3 +import pathlib -# __init__.py -# Date: 11/02/2022 -# Author: Eugeniu Costetchi -# Email: costezki.eugen@gmail.com - -""" """ +NOTICE_VALIDATOR_RESOURCES_PATH = pathlib.Path(__file__).parent.resolve() +SPARQL_QUERY_TEMPLATES_PATH = NOTICE_VALIDATOR_RESOURCES_PATH / "sparql_query_templates" +NOTICE_AVAILABILITY_SPARQL_QUERY_TEMPLATE_PATH = SPARQL_QUERY_TEMPLATES_PATH / "check_notice_availability.rq" +NOTICES_AVAILABILITY_SPARQL_QUERY_TEMPLATE_PATH = SPARQL_QUERY_TEMPLATES_PATH / "check_notices_availability.rq" \ No newline at end of file diff --git a/ted_sws/notice_validator/resources/sparql_query_templates/check_notice_availability.rq b/ted_sws/notice_validator/resources/sparql_query_templates/check_notice_availability.rq new file mode 100644 index 000000000..506407f02 --- /dev/null +++ b/ted_sws/notice_validator/resources/sparql_query_templates/check_notice_availability.rq @@ -0,0 +1,4 @@ +ASK {{ +VALUES ?instance {{<{notice_uri}>}} +?instance ?predicate [] . +}} \ No newline at end of file diff --git a/ted_sws/notice_validator/resources/sparql_query_templates/check_notices_availability.rq b/ted_sws/notice_validator/resources/sparql_query_templates/check_notices_availability.rq new file mode 100644 index 000000000..6168d4096 --- /dev/null +++ b/ted_sws/notice_validator/resources/sparql_query_templates/check_notices_availability.rq @@ -0,0 +1,5 @@ +select distinct ?s +{{ +VALUES ?s {{{notice_uries}}} +?s ?p ?o . +}} diff --git a/ted_sws/notice_validator/services/check_availability_of_notice_in_cellar.py b/ted_sws/notice_validator/services/check_availability_of_notice_in_cellar.py index 189df592d..67a9c4ead 100644 --- a/ted_sws/notice_validator/services/check_availability_of_notice_in_cellar.py +++ b/ted_sws/notice_validator/services/check_availability_of_notice_in_cellar.py @@ -1,3 +1,4 @@ +import time from typing import List, Set from pymongo import MongoClient @@ -5,13 +6,14 @@ from ted_sws.core.service.batch_processing import chunks from ted_sws.data_manager.adapters.notice_repository import NoticeRepository from ted_sws.data_manager.adapters.sparql_endpoint import SPARQLTripleStoreEndpoint +from ted_sws.notice_validator.resources import NOTICE_AVAILABILITY_SPARQL_QUERY_TEMPLATE_PATH, \ + NOTICES_AVAILABILITY_SPARQL_QUERY_TEMPLATE_PATH WEBAPI_SPARQL_URL = "https://publications.europa.eu/webapi/rdf/sparql" -CELLAR_NOTICE_AVAILABILITY_QUERY = "ASK {{ VALUES ?instance {{<{notice_uri}>}} ?instance ?predicate [] . }}" -CELLAR_NOTICES_AVAILABILITY_QUERY = "select distinct ?s {{VALUES ?s {{$notice_uries}} ?s ?p ?o . }}" WEBAPI_SPARQL_RUN_FORMAT = "application/sparql-results+json" INVALID_NOTICE_URI = 'https://www.w3.org/1999/02/22-rdf-syntax-ns#type-invalid' -DEFAULT_NOTICES_BATCH_SIZE = 1000 +DEFAULT_NOTICES_BATCH_SIZE = 5000 +DEFAULT_CELLAR_REQUEST_DELAY = 3 def check_availability_of_notice_in_cellar(notice_uri: str, endpoint_url: str = WEBAPI_SPARQL_URL) -> bool: @@ -21,7 +23,8 @@ def check_availability_of_notice_in_cellar(notice_uri: str, endpoint_url: str = :param endpoint_url: :return: """ - query = CELLAR_NOTICE_AVAILABILITY_QUERY.format(notice_uri=notice_uri) + query_template = NOTICE_AVAILABILITY_SPARQL_QUERY_TEMPLATE_PATH.read_text(encoding="utf-8") + query = query_template.format(notice_uri=notice_uri) result = SPARQLTripleStoreEndpoint(endpoint_url=endpoint_url).with_query(sparql_query=query).fetch_tree() return result['boolean'] @@ -33,9 +36,11 @@ def check_availability_of_notices_in_cellar(notice_uries: List[str], endpoint_ur :param endpoint_url: :return: """ + query_template = NOTICES_AVAILABILITY_SPARQL_QUERY_TEMPLATE_PATH.read_text(encoding="utf-8") notice_uries = " ".join([f"<{notice_uri}>" for notice_uri in notice_uries]) - query = CELLAR_NOTICE_AVAILABILITY_QUERY.format(notice_uri=notice_uries) - result = SPARQLTripleStoreEndpoint(endpoint_url=endpoint_url).with_query(sparql_query=query).fetch_tabular() + query = query_template.format(notice_uries=notice_uries) + result = SPARQLTripleStoreEndpoint(endpoint_url=endpoint_url, + use_post_method=True).with_query(sparql_query=query).fetch_tabular() return set(result['s'].to_list()) @@ -66,11 +71,13 @@ def validate_notice_availability_in_cellar(notice: Notice, notice_uri: str = Non return notice -def validate_notices_availability_in_cellar(notice_statuses: List[NoticeStatus], mongodb_client: MongoClient): +def validate_notices_availability_in_cellar(notice_statuses: List[NoticeStatus], mongodb_client: MongoClient, + cellar_request_delay_in_seconds: int = DEFAULT_CELLAR_REQUEST_DELAY): """ This function validate availability in cellar foreach notice from notices with a notice_status in notice_statuses. :param notice_statuses: :param mongodb_client: + :param cellar_request_delay_in_seconds: :return: """ notice_repository = NoticeRepository(mongodb_client=mongodb_client) @@ -90,3 +97,4 @@ def validate_notices_availability_in_cellar(notice_statuses: List[NoticeStatus], else: notice.update_status_to(new_status=NoticeStatus.PUBLICLY_UNAVAILABLE) notice_repository.update(notice=notice) + time.sleep(cellar_request_delay_in_seconds) diff --git a/tests/e2e/notice_validator/test_check_availability_of_notice_in_cellar.py b/tests/e2e/notice_validator/test_check_availability_of_notice_in_cellar.py index 98ec3d907..2f2fbce54 100644 --- a/tests/e2e/notice_validator/test_check_availability_of_notice_in_cellar.py +++ b/tests/e2e/notice_validator/test_check_availability_of_notice_in_cellar.py @@ -1,6 +1,7 @@ from ted_sws.core.model.notice import NoticeStatus from ted_sws.notice_validator.services.check_availability_of_notice_in_cellar import \ - check_availability_of_notice_in_cellar, validate_notice_availability_in_cellar + check_availability_of_notice_in_cellar, validate_notice_availability_in_cellar, \ + check_availability_of_notices_in_cellar, DEFAULT_NOTICES_BATCH_SIZE def test_check_availability_of_notice_in_cellar(valid_cellar_uri, invalid_cellar_uri): @@ -19,3 +20,10 @@ def test_validate_notice_availability_in_cellar(fake_notice_F03, valid_cellar_ur fake_notice_F03._status = NoticeStatus.PUBLISHED validate_notice_availability_in_cellar(notice=fake_notice_F03, notice_uri=invalid_cellar_uri) assert fake_notice_F03.status == NoticeStatus.PUBLICLY_UNAVAILABLE + + +def test_validate_notices_availability_in_cellar(valid_cellar_uri, invalid_cellar_uri): + notice_uries = [valid_cellar_uri] * DEFAULT_NOTICES_BATCH_SIZE + [invalid_cellar_uri] + available_uries = check_availability_of_notices_in_cellar(notice_uries=notice_uries) + assert valid_cellar_uri in available_uries + assert invalid_cellar_uri not in available_uries