Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

normaliser changes #242

Merged
merged 1 commit into from
Sep 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions ted_sws/notice_metadata_processor/services/metadata_normalizer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import abc
import re
from datetime import datetime
from typing import Dict, Tuple, List

Expand Down Expand Up @@ -137,12 +138,22 @@ def normalise_legal_basis_value(cls, value: str) -> str:
@classmethod
def normalise_form_number(cls, value: str) -> str:
    """
    Normalise a form number to the ``<letters><at least two digits>`` format.

    The input data is still normalised (rather than trusted as-is).
    Rules:
    * The form number should start with a letter ("F", "T"); a value that
      starts with a digit gets the default "F" prefix
    * The form number isn't always a number (CEI, EEIG); such values are
      returned unchanged
    * If the number is between 1 - 9 then it must have 0 as prefix (F02 not F2)

    :param value: the extracted form number; may be None or empty
    :return: the normalised form number, or the input unchanged when it is
             empty/None or does not match the ``letters + digits`` shape
    """
    # Nothing to normalise; also keeps re.split from raising TypeError on None.
    if not value:
        return value
    # Zero-width split right before the first digit: "F2" -> ["F", "2"],
    # "18" -> ["", "18"], "CEI" -> ["CEI"] (no digit, so no split happens).
    form_number_parts = re.split(r"(?=\d)", value, maxsplit=1)
    if len(form_number_parts) == 2:
        text_part, number_part = form_number_parts
        # A value starting with a digit has an empty prefix: default to "F".
        text_part = text_part or "F"
        # Only pure "letters + digits" values are rewritten; anything with
        # trailing letters or other symbols (e.g. "FX03FG") is left as is.
        if text_part.isalpha() and number_part.isdecimal():
            return text_part + number_part.zfill(2)
    return value

@classmethod
Expand All @@ -168,7 +179,8 @@ def get_filter_variables_values(cls, form_number: str, extracted_notice_type: st
filter_map.query(f"{FORM_NUMBER_KEY}=='{variables[FORM_NUMBER_KEY]}'").to_dict(orient='records')[0]
except:
raise Exception(
f"This notice doesn't have a form number or the extracted form number is not in the mapping. Form number found is {form_number}")
f"This notice doesn't have a form number or the extracted form number is not in the mapping. "
f"Form number found is {form_number}, document code is {document_type_code} and legal basis is {legal_basis}")

for key, value in filter_variables.items():
if value == 0:
Expand Down
3 changes: 0 additions & 3 deletions ted_sws/resources/mapping_files/eforms_mapping.csv
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,5 @@ result,can-desg,32014L0025,37
cont-modif,can-modif,32014L0024,38
cont-modif,can-modif,32014L0025,39
cont-modif,can-modif,32014L0023,40
change,corr,32014L0024,41
change,corr,32014L0025,41
change,corr,32014L0023,41
planning,pin-tran,32007R1370,T01
result,can-tran,32007R1370,T02
3 changes: 2 additions & 1 deletion ted_sws/resources/mapping_files/sforms_mapping.csv
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ F12,,32014L0024,,23
F12,,32014L0025,,24
F13,,32014L0024,,36
F13,,32014L0025,,37
F14,,,,41
F15,,32009L0081,,27
F15,,32014L0023,,28
F15,,32014L0024,,25
Expand All @@ -34,10 +33,12 @@ F20,,32014L0025,,39
F20,,32018R1046,,38
F21,AWARD_CONTRACT,,,33
F21,CONTRACT,,,20
F21,PRI_ONLY,,,4
F21,PRI_CALL_COMPETITION,,,12
F22,AWARD_CONTRACT,,,34
F22,CONTRACT,,,21
F22,PER_CALL_COMPETITION,,,13
F22,PER_ONLY,,,5
F22,QSU_ONLY,,,15
F23,,,,35
F24,,,,19
Expand Down
38 changes: 23 additions & 15 deletions tests/unit/notice_metadata_processor/test_metadata_normaliser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from ted_sws.notice_metadata_processor.services.metadata_normalizer import normalise_notice, normalise_notice_by_id, \
MetadataNormaliser, ExtractedMetadataNormaliser, FORM_NUMBER_KEY, SF_NOTICE_TYPE_KEY, LEGAL_BASIS_KEY, \
DOCUMENT_CODE_KEY
from ted_sws.notice_metadata_processor.services.xml_manifestation_metadata_extractor import XMLManifestationMetadataExtractor
from ted_sws.notice_metadata_processor.services.xml_manifestation_metadata_extractor import \
XMLManifestationMetadataExtractor


def test_metadata_normaliser_by_notice(indexed_notice):
Expand Down Expand Up @@ -40,29 +41,37 @@ def test_metadata_normaliser(indexed_notice):


def test_normalise_form_number(indexed_notice):
    """Check form number normalisation on the fixture notice and on hand-written edge cases."""
    extractor = XMLManifestationMetadataExtractor(xml_manifestation=indexed_notice.xml_manifestation)
    extracted_metadata = extractor.to_metadata()
    extracted_metadata_normaliser = ExtractedMetadataNormaliser(extracted_metadata=extracted_metadata)

    # Value coming from the fixture notice itself.
    raw_form_number = extracted_metadata.extracted_form_number
    assert raw_form_number == "18"
    assert extracted_metadata_normaliser.normalise_form_number(value=raw_form_number) == "F18"
    assert extracted_metadata_normaliser.normalise_form_number(value="T01") == "T01"
    assert extracted_metadata_normaliser.normalise_form_number(value=None) is None

    # (input, expected) pairs, checked in the same order as before.
    expectations = [
        ("FFFSA", "FFFSA"),
        ("F18", "F18"),
        ("F01", "F01"),
        ("F2", "F02"),
        ("22", "F22"),
        ("2", "F02"),
        ("F", "F"),
        ("TX01", "TX01"),
        ("TX1", "TX01"),
        ("FX03FG", "FX03FG"),
        ("1F03FG", "1F03FG"),
    ]
    for given, expected in expectations:
        assert extracted_metadata_normaliser.normalise_form_number(given) == expected


def test_normalise_legal_basis(indexed_notice):
    """Check that the extracted legal basis directive is normalised to its CELEX-style code."""
    extractor = XMLManifestationMetadataExtractor(xml_manifestation=indexed_notice.xml_manifestation)
    extracted_metadata = extractor.to_metadata()
    extracted_metadata_normaliser = ExtractedMetadataNormaliser(extracted_metadata=extracted_metadata)

    directive = extracted_metadata.legal_basis_directive
    assert directive == "2009/81/EC"
    normalised_directive = extracted_metadata_normaliser.normalise_legal_basis_value(value=directive)
    assert normalised_directive == "32009L0081"


def test_get_map_value(indexed_notice):
    """Check that a country code is resolved to its authority URI through the countries mapping."""
    extractor = XMLManifestationMetadataExtractor(xml_manifestation=indexed_notice.xml_manifestation)
    extracted_metadata = extractor.to_metadata()
    extracted_metadata_normaliser = ExtractedMetadataNormaliser(extracted_metadata=extracted_metadata)

    countries_mapping = MappingFilesRegistry().countries
    expected_uri = "http://publications.europa.eu/resource/authority/country/DEU"
    assert extracted_metadata_normaliser.get_map_value(mapping=countries_mapping, value="DE") == expected_uri


def test_filter_df_by_variables():
df = MappingFilesRegistry().ef_notice_df
filtered_df = filter_df_by_variables(df=df, form_type="planning",
Expand All @@ -71,9 +80,9 @@ def test_filter_df_by_variables():
assert len(filtered_df.index) == 3
assert "32014L0024" in filtered_df["eform_legal_basis"].values


def test_get_form_type_and_notice_type(indexed_notice):
extracted_metadata = XMLManifestationMetadataExtractor(xml_manifestation=indexed_notice.xml_manifestation).to_metadata()
extracted_metadata = XMLManifestationMetadataExtractor(
xml_manifestation=indexed_notice.xml_manifestation).to_metadata()
extracted_metadata_normaliser = ExtractedMetadataNormaliser(extracted_metadata=extracted_metadata)
form_type, notice_type, legal_basis, eforms_subtype = extracted_metadata_normaliser.get_form_type_and_notice_type(
ef_map=MappingFilesRegistry().ef_notice_df,
Expand All @@ -86,9 +95,9 @@ def test_get_form_type_and_notice_type(indexed_notice):
assert "32014L0024" == legal_basis
assert "16" == eforms_subtype


def test_get_filter_values(indexed_notice):
extracted_metadata = XMLManifestationMetadataExtractor(xml_manifestation=indexed_notice.xml_manifestation).to_metadata()
extracted_metadata = XMLManifestationMetadataExtractor(
xml_manifestation=indexed_notice.xml_manifestation).to_metadata()
extracted_metadata_normaliser = ExtractedMetadataNormaliser(extracted_metadata=extracted_metadata)
filter_map = MappingFilesRegistry().filter_map_df
filter_variables_dict = extracted_metadata_normaliser.get_filter_variables_values(form_number="F07",
Expand All @@ -109,7 +118,6 @@ def test_get_filter_values(indexed_notice):
document_type_code="7",
legal_basis="legal")


def test_normalising_process_on_failed_notice_in_dag(notice_2021):
extracted_metadata = XMLManifestationMetadataExtractor(
xml_manifestation=notice_2021.xml_manifestation).to_metadata()
Expand Down