Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

integrate TED-API v3 and add eForms sample data #518

Merged
merged 7 commits into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 9 additions & 34 deletions ted_sws/core/model/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@

""" """
from enum import Enum
from typing import List, Optional
from typing import List, Optional, Union

from pydantic import Field
from pydantic import Field, validator
from pydantic.annotated_types import NamedTuple

from ted_sws.core.model import PropertyBaseModel
Expand Down Expand Up @@ -124,39 +124,14 @@ class NormalisedMetadataView(Metadata):
eform_sdk_version: Optional[str]



class TEDMetadata(Metadata):
"""
Stores notice original metadata
"""
AA: List[str] = None
AC: str = None
CY: List[str] = None
DD: str = None
DI: str = None
DS: str = None
DT: List[str] = None
MA: List[str] = None
NC: List[str] = None
ND: str = None
NL: str = None
OC: List[str] = None
OJ: str = None
OL: str = None
OY: List[str] = None
PC: List[str] = None
PD: str = None
PR: str = None
RC: List[str] = None
RN: List[str] = None
RP: str = None
TD: str = None
TVH: str = None
TVL: str = None
TY: str = None
award_criterion_type: str = Field(default=None, alias='award-criterion-type')
corporate_body: List[str] = Field(default=None, alias='corporate-body')
funding: List[str] = None
notice_identifier: str = Field(default=None, alias='notice-identifier')
notice_type: str = Field(default=None, alias='notice-type')
notice_version: str = Field(default=None, alias='notice-version')
ND: Optional[str] = None
PD: Optional[str] = None
# ------------------------------------------------------------------
# Note: In TED-API v3 this field is a str; in the past it was a list
# ------------------------------------------------------------------
RN: Optional[Union[List[str], str]] = None
# ------------------------------------------------------------------
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,14 @@ def deduplicate_procedure_entities(notices: List[Notice], procedure_cet_uri: str
notice_families = defaultdict(list)
for notice in notices:
if notice.original_metadata and notice.original_metadata.RN:
parent_notice_id = notice.original_metadata.RN[0]
parent_notice_id_field = notice.original_metadata.RN
# ------------------------------------------------------------------
# Note: This logic keeps backward compatibility with the old TED-API data format, where RN was a list.
# ------------------------------------------------------------------
if isinstance(parent_notice_id_field, list):
parent_notice_id_field = parent_notice_id_field[0]
# ------------------------------------------------------------------
parent_notice_id = parent_notice_id_field
parent_notice_id = f"{parent_notice_id[4:]}-{parent_notice_id[:4]}"
notice_families[parent_notice_id].append(notice)

Expand Down
115 changes: 73 additions & 42 deletions ted_sws/notice_fetcher/adapters/ted_api.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,31 @@
import base64
import json
import pathlib
from datetime import date
from typing import List
from typing import List, Generator

import requests

from ted_sws import config
from ted_sws.event_manager.services.log import log_warning
from ted_sws.notice_fetcher.adapters.ted_api_abc import TedAPIAdapterABC, RequestAPI

DEFAULT_TED_API_QUERY_RESULT_SIZE = {"pageSize": 100,
"pageNum": 1,
"scope": 3
DEFAULT_TED_API_QUERY_RESULT_SIZE = {"limit": 100,
"page": 1,
"scope": "ALL",
}

DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields": ["AA", "AC", "CY", "DD", "DI", "DS", "TVL", "TY",
"DT", "MA", "NC", "ND", "OC", "OJ", "OL", "OY",
"PC", "PD", "PR", "RC", "RN", "RP", "TD", "TVH",
"CONTENT",
# INFO: This query result fields is not supported correctly by TED-API.
#"notice-type", "award-criterion-type", "corporate-body",
#"funding", "notice-identifier", "notice-version"
]}

TOTAL_DOCUMENTS_NUMBER = "total"
RESPONSE_RESULTS = "results"
DEFAULT_TED_API_QUERY_RESULT_FIELDS = {"fields": ["ND", "PD", "RN"]}

TOTAL_DOCUMENTS_NUMBER = "totalNoticeCount"
RESPONSE_RESULTS = "notices"
DOCUMENT_CONTENT = "content"
RESULT_PAGE_NUMBER = "pageNum"
RESULT_PAGE_NUMBER = "page"
TED_API_FIELDS = "fields"
DOCUMENT_CONTENT_FIELD = "CONTENT"
LINKS_TO_CONTENT_KEY = "links"
XML_CONTENT_KEY = "xml"
MULTIPLE_LANGUAGE_CONTENT_KEY = "MUL"
ENGLISH_LANGUAGE_CONTENT_KEY = "ENG"
DOCUMENT_NOTICE_ID_KEY = "ND"


class TedRequestAPI(RequestAPI):
Expand All @@ -40,13 +38,12 @@ def __call__(self, api_url: str, api_query: dict) -> dict:
:return: dict
"""

response = requests.get(api_url, params=api_query)
response = requests.post(api_url, json=api_query)
if response.ok:
response_content = json.loads(response.text)
return response_content
else:
raise Exception(f"The TED-API call failed with: {response}")

raise Exception(f"The TED-API call failed with: {response}, {response.content}, {api_url}")


class TedAPIAdapter(TedAPIAdapterABC):
Expand All @@ -71,7 +68,7 @@ def get_by_wildcard_date(self, wildcard_date: str) -> List[dict]:
:return: List[str]
"""

query = {"q": f"PD=[{wildcard_date}]"}
query = {"query": f"PD={wildcard_date}"}

return self.get_by_query(query=query)

Expand All @@ -83,48 +80,82 @@ def get_by_range_date(self, start_date: date, end_date: date) -> List[dict]:
:return:List[str]
"""

date_filter = f">={start_date.strftime('%Y%m%d')} AND <={end_date.strftime('%Y%m%d')}"
date_filter = f"PD>={start_date.strftime('%Y%m%d')} AND PD<={end_date.strftime('%Y%m%d')}"

query = {"q": f"PD=[{date_filter}]"}
query = {"query": date_filter}

return self.get_by_query(query=query)

def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
def _retrieve_document_content(self, document_content: dict) -> str:
    """
    Method to download the XML content of a notice returned by the TED-API.

    The language variant of the XML link is chosen in this order:
    the multilingual link, then the English link, then the first link available.
    :param document_content: notice dict holding the XML links under the
        "links" -> "xml" keys
    :return: str, the text of the HTTP response for the selected XML link
    :raises Exception: when the notice has no XML link at all or when the
        download request is not successful
    """
    xml_links = document_content[LINKS_TO_CONTENT_KEY][XML_CONTENT_KEY]
    # Only used for diagnostics; .get() keeps a missing id from masking the real problem.
    notice_id = document_content.get(DOCUMENT_NOTICE_ID_KEY)
    if not xml_links:
        raise Exception(f"The notice content can't be loaded!: no XML links found in {notice_id}")

    language_key = MULTIPLE_LANGUAGE_CONTENT_KEY
    if language_key not in xml_links:
        if ENGLISH_LANGUAGE_CONTENT_KEY in xml_links:
            language_key = ENGLISH_LANGUAGE_CONTENT_KEY
        else:
            # dict views are not subscriptable in Python 3, so take the first key via an iterator
            language_key = next(iter(xml_links))

        log_warning(
            f"Language key {MULTIPLE_LANGUAGE_CONTENT_KEY} not found in {notice_id},"
            f" and will be used language key {language_key}!")

    xml_document_content_link = xml_links[language_key]
    response = requests.get(xml_document_content_link)

    if response.ok:
        return response.text
    raise Exception(f"The notice content can't be loaded!: {response}, {response.content}")

def get_generator_by_query(self, query: dict, result_fields: dict = None) -> Generator[dict, None, None]:
    """
    Method to lazily get documents content by passing a query to the API (json)

    Result pages are requested one at a time, and the documents of a page are
    yielded before the next page is requested. For every document the XML
    content is downloaded and stored under the "content" key, and the "links"
    entry is removed.
    :param query: query dict sent to the API; it is copied, not modified
    :param result_fields: optional dict merged into the query instead of the
        default result fields
    :return:Generator[dict]
    """
    # Work on a copy so paging keys do not leak into the caller's dict.
    api_query = dict(query)
    api_query.update(DEFAULT_TED_API_QUERY_RESULT_SIZE)
    api_query.update(result_fields or DEFAULT_TED_API_QUERY_RESULT_FIELDS)
    page_size = int(api_query.get("limit", 100))

    response_body = self.request_api(api_url=self.ted_api_url, api_query=api_query)
    documents_number = int(response_body[TOTAL_DOCUMENTS_NUMBER])
    # Ceiling division: an exact multiple of the page size must not add an empty extra page.
    result_pages = -(-documents_number // page_size)

    page_number = 1
    while True:
        for document_content in response_body[RESPONSE_RESULTS]:
            document_content[DOCUMENT_CONTENT] = self._retrieve_document_content(document_content)
            del document_content[LINKS_TO_CONTENT_KEY]
            yield document_content

        page_number += 1
        if page_number > result_pages:
            break
        api_query[RESULT_PAGE_NUMBER] = page_number
        response_body = self.request_api(api_url=self.ted_api_url, api_query=api_query)

def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
    """
    Method to collect into a list every document produced for a query
    :param query: query dict handed over to get_generator_by_query
    :param result_fields: optional result fields dict, passed through unchanged
    :return:List[dict]
    """
    notices_generator = self.get_generator_by_query(query=query, result_fields=result_fields)
    return [*notices_generator]

def get_by_id(self, document_id: str) -> dict:
    """
    Method to get a document content by passing an ID
    :param document_id: notice identifier used in the "ND=<id>" query
    :return: dict, the first document returned for that identifier
    :raises IndexError: when no document is returned for the identifier
    """

    query = {"query": f"ND={document_id}"}
    notices = self.get_by_query(query=query)
    if not notices:
        # Same exception type as indexing an empty list, but it names the missing notice.
        raise IndexError(f"No notice found for document id: {document_id}")
    return notices[0]
31 changes: 31 additions & 0 deletions tests/e2e/notice_fetcher/_test_generate_eforms_sample_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pathlib

TED_API_EFORMS_QUERY = """
TD NOT IN (C E G I D P M Q O R 0 1 2 3 4 5 6 7 8 9 B S Y V F A H J K) AND
notice-subtype IN ({eforms_subtype}) AND
FT~"eforms-sdk-{eforms_sdk_version}"
"""

EFORMS_SUBTYPES = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
EFORMS_SDK_VERSIONS = [f"1.{version}" for version in range(3, 11)]


def _test_generate_eforms_sample_dataset(ted_document_search):
    """
    Download one sample eForms notice per (SDK version, notice subtype) pair.

    Each notice found is written as "<ND>.xml" under
    eforms_samples/eforms_sdk_v<version>/eform_subtype_<subtype>, next to this file.
    Pairs for which the query yields nothing create no directory.
    :param ted_document_search: object exposing get_generator_by_query(query=...)
    """
    samples_root = pathlib.Path(__file__).parent / "eforms_samples"

    for sdk_version in EFORMS_SDK_VERSIONS:
        sdk_dir = samples_root / f"eforms_sdk_v{sdk_version}"
        for subtype in EFORMS_SUBTYPES:
            target_dir = sdk_dir / f"eform_subtype_{subtype}"
            print(f"Load for {target_dir}")

            query_text = TED_API_EFORMS_QUERY.format(eforms_sdk_version=sdk_version,
                                                     eforms_subtype=subtype)
            query = {"query": query_text}
            print(query)

            notices = ted_document_search.get_generator_by_query(query=query)
            # The range bounds set how many samples are taken per pair (currently one).
            for _ in range(1, 2):
                notice = next(notices, None)
                if notice is None:
                    break
                target_dir.mkdir(parents=True, exist_ok=True)
                notice_xml_path = target_dir / f"{notice['ND']}.xml"
                notice_xml_path.write_text(notice["content"], encoding="utf-8")
5 changes: 3 additions & 2 deletions tests/e2e/notice_fetcher/test_notice_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@


def test_notice_fetcher_by_identifier(notice_repository, ted_document_search):
document_id = "067623-2022"
document_id = "67623-2022"
NoticeFetcher(notice_repository=notice_repository, ted_api_adapter=ted_document_search).fetch_notice_by_id(
document_id=document_id)
notice = notice_repository.get(reference=document_id)
assert notice is not None
assert isinstance(notice, Notice)
assert notice
assert notice.original_metadata
Expand All @@ -18,7 +19,7 @@ def test_notice_fetcher_by_identifier(notice_repository, ted_document_search):


def test_notice_fetcher_by_search_query(notice_repository, ted_document_search):
query = {"q": "ND=[67623-2022]"}
query = {"query": "ND=67623-2022"}

NoticeFetcher(notice_repository=notice_repository, ted_api_adapter=ted_document_search).fetch_notices_by_query(
query=query)
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/notice_fetcher/test_ted_request_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def test_ted_request_api():
ted_api_request = TedRequestAPI()
notice_by_query = ted_api_request(api_url=config.TED_API_URL, api_query={"q": "ND=[67623-2022]"})
notice_by_query = ted_api_request(api_url=config.TED_API_URL, api_query={"query": "ND=[67623-2022]"})
assert notice_by_query
assert isinstance(notice_by_query, dict)
with pytest.raises(Exception) as e:
Expand Down
10 changes: 5 additions & 5 deletions tests/fakes/fake_ted_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def get_fake_api_response() -> dict:
path = TEST_DATA_PATH / "notices" / "2021-OJS237-623049.json"
path = TEST_DATA_PATH / "notice_fetcher" / "ted_api_response" / "ted_api_response.json"
return json.loads(path.read_text())


Expand Down Expand Up @@ -38,15 +38,15 @@ def get_by_wildcard_date(self, wildcard_date: str) -> List[dict]:
:param wildcard_date:
:return:
"""
return [notice_data for notice_data in get_fake_api_response()["results"]]
return [notice_data for notice_data in get_fake_api_response()["notices"]]

def get_by_id(self, document_id: str) -> dict:
"""

:param document_id:
:return:
"""
return get_fake_api_response()["results"][0]
return get_fake_api_response()["notices"][0]

def get_by_range_date(self, start_date: date, end_date: date) -> List[dict]:
"""
Expand All @@ -55,7 +55,7 @@ def get_by_range_date(self, start_date: date, end_date: date) -> List[dict]:
:param end_date:
:return:
"""
return [notice_data for notice_data in get_fake_api_response()["results"]]
return [notice_data for notice_data in get_fake_api_response()["notices"]]

def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
"""
Expand All @@ -64,4 +64,4 @@ def get_by_query(self, query: dict, result_fields: dict = None) -> List[dict]:
:param result_fields:
:return:
"""
return [notice_data for notice_data in get_fake_api_response()["results"]]
return [notice_data for notice_data in get_fake_api_response()["notices"]]
Loading
Loading