diff --git a/backend/sample/test.txt b/backend/sample/test.txt index 09204d22..2b9cc030 100644 --- a/backend/sample/test.txt +++ b/backend/sample/test.txt @@ -2,36 +2,36 @@ stopwords:fr: aux,au,de,le,du,la,a,et -synonyms:en:passion fruit, passionfruit +synonyms:en: passion fruit, passionfruit -synonyms:fr:fruit de la passion, maracuja, passion +synonyms:fr: fruit de la passion, maracuja, passion -en:yogurts, yoghurts -fr:yaourts, yoghourts, yogourts +en: yogurts, yoghurts +fr: yaourts, yoghourts, yogourts - str: normalized_id = f"{lc}:{normalized_main_tag}" return normalized_id - def _get_lc_value(self, line: str) -> tuple[str, list[str]]: - """Get the language code "lc" and a list of normalized values""" + def _get_lc_value(self, line: str, remove_stopwords=True) -> tuple[str, list[str]]: + """Get the language code "lc" and a list of values and normalized values""" lc, line = line.split(":", 1) - new_line: list[str] = [] - for word in line.split(","): - new_line.append(normalize_text(word, lc, stopwords=self.stopwords)) - return lc, new_line + values = [word.strip() for word in line.split(",")] + stopwords = self.stopwords if remove_stopwords else [] + tags = [normalize_text(word, lc, stopwords=stopwords) for word in values] + return lc, values, tags def _set_data_id(self, data: NodeData, id: str, line_number: int) -> NodeData: if not data.id: @@ -253,12 +253,12 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N saved_nodes = [] index_stopwords = 0 index_synonyms = 0 - + # Check if it is correctly written correctly_written = re.compile(r"\w+\Z") # stopwords will contain a list of stopwords with their language code as key self.stopwords = {} - # the other entries + # the first entry is after __header__ which was created before data = NodeData(is_before="__header__") line_number = ( entries_start_line # if the iterator is empty, line_number will not be unbound @@ -305,19 +305,17 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N index_stopwords += 1 # remove "stopwords:" part line = line[10:] - # compute raw values outside _get_lc_value as it normalizes them! - tags = [words.strip() for words in line[3:].split(",")] try: - lc, value = self._get_lc_value(line) + lc, tags, tags_ids = self._get_lc_value(line, remove_stopwords=False) except ValueError: self.parser_logger.error( f"Missing language code at line {line_number + 1} ? '{self.parser_logger.ellipsis(line)}'" ) else: data.tags["tags_" + lc] = tags - data.tags["tags_ids_" + lc] = value - # add the normalized list with its lc - self.stopwords[lc] = value + data.tags["tags_ids_" + lc] = tags_ids + # add the normalized list with its lc to current processing + self.stopwords[lc] = tags_ids elif line.startswith("synonyms"): # general synonyms definition for a language id = "synonyms:" + str(index_synonyms) @@ -325,23 +323,22 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N index_synonyms += 1 # remove "synonyms:" part line = line[9:] - # compute raw values outside _get_lc_value as it normalizes them! - tags = [words.strip() for words in line[3:].split(",")] try: - lc, value = self._get_lc_value(line) + lc, tags, tags_ids = self._get_lc_value(line) except ValueError: self.parser_logger.error( f"Missing language code at line {line_number + 1} ? '{self.parser_logger.ellipsis(line)}'" ) else: data.tags["tags_" + lc] = tags - data.tags["tags_ids_" + lc] = value + data.tags["tags_ids_" + lc] = tags_ids elif line[0] == "<": # parent definition data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1)) elif self.is_entry_synonyms_line(line): # synonyms definition if not data.id: + # the first item on the first line gives the id data.id = self._normalize_entry_id(line.split(",", 1)[0]) # first 2-3 characters before ":" are the language code data.main_language = data.id.split(":", 1)[0] @@ -383,7 +380,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N ) if property_name: prop_key = "prop_" + property_name + "_" + lc - data.properties[prop_key] = property_value + data.properties[prop_key] = property_value.strip() data = self._get_node_data_with_comments_above_key( data, line_number, prop_key ) diff --git a/parser/openfoodfacts_taxonomy_parser/unparser.py b/parser/openfoodfacts_taxonomy_parser/unparser.py index c8222187..158c2578 100644 --- a/parser/openfoodfacts_taxonomy_parser/unparser.py +++ b/parser/openfoodfacts_taxonomy_parser/unparser.py @@ -54,7 +54,7 @@ def list_tags_lc(self, node): def get_tags_line(self, node, lc): """return a string that should look like the original line""" line = (", ").join(node["tags_" + lc]) - return lc + ":" + line + return lc + ": " + line @staticmethod def property_sort_key(property): @@ -75,7 +75,7 @@ def get_property_line(self, node, property): """return a string that should look like the original property line""" property_name, lc = property.rsplit("_", 1) property_value = node["prop_" + property] - line = property_name + ":" + lc + ":" + property_value + line = property_name + ":" + lc + ": " + property_value return line def get_parents_lines(self, parents): @@ -83,7 +83,7 @@ def get_parents_lines(self, parents): parent = dict(parent) lc = parent["main_language"] parent_id = parent["tags_" + lc][0] - yield "<" + lc + ":" + parent_id + yield "<" + lc + ": " + parent_id def iter_lines(self, project_label): previous_block_id = "" diff --git a/parser/tests/data/test.txt b/parser/tests/data/test.txt index f574d14c..92f38f86 100644 --- a/parser/tests/data/test.txt +++ b/parser/tests/data/test.txt @@ -2,37 +2,37 @@ stopwords:fr: aux,au,de,le,du,la,a,et,test normalisation -synonyms:en:passion fruit, passionfruit +synonyms:en: passion fruit, passionfruit -synonyms:fr:fruit de la passion, maracuja, passion +synonyms:fr: fruit de la passion, maracuja, passion -