Skip to content

Commit

Permalink
fix: add a space after line identifier (#520)
Browse files Browse the repository at this point in the history
This is another important part to normalize: the whitespace at the start of
a line's value, right after the line identifier.

This makes the written format consistent.
  • Loading branch information
alexgarel authored Jul 22, 2024
1 parent 83bbbe5 commit 51f7b43
Show file tree
Hide file tree
Showing 8 changed files with 137 additions and 140 deletions.
44 changes: 22 additions & 22 deletions backend/sample/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,36 @@

stopwords:fr: aux,au,de,le,du,la,a,et

synonyms:en:passion fruit, passionfruit
synonyms:en: passion fruit, passionfruit

synonyms:fr:fruit de la passion, maracuja, passion
synonyms:fr: fruit de la passion, maracuja, passion

en:yogurts, yoghurts
fr:yaourts, yoghourts, yogourts
en: yogurts, yoghurts
fr: yaourts, yoghourts, yogourts

<en:yogurts
en:banana yogurts
fr:yaourts à la banane
<en: yogurts
en: banana yogurts
fr: yaourts à la banane

<en:yogurts
en:Passion fruit yogurts
fr:yaourts au fruit de la passion
<en: yogurts
en: Passion fruit yogurts
fr: yaourts au fruit de la passion

<fr:yaourts fruit de la passion
fr:yaourts au fruit de la passion allégés
<fr: yaourts fruit de la passion
fr: yaourts au fruit de la passion allégés

# meat

en:meat
vegan:en:no
carbon_footprint_fr_foodges_value:fr:10
en: meat
vegan:en: no
carbon_footprint_fr_foodges_value:fr: 10

<en:meat
en:fake-meat
vegan:en:yes
<en: meat
en: fake-meat
vegan:en: yes

en:fake-stuff
en: fake-stuff

<en:fake-stuff
<en:fake-meat
en:fake-duck-meat
<en: fake-stuff
<en: fake-meat
en: fake-duck-meat
102 changes: 51 additions & 51 deletions backend/tests/data/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,94 +2,94 @@

stopwords:fr: aux,au,de,le,du,la,a,et

synonyms:en:passion fruit, passionfruit
synonyms:en: passion fruit, passionfruit

synonyms:fr:fruit de la passion, fruits de la passion, maracuja, passion
synonyms:fr: fruit de la passion, fruits de la passion, maracuja, passion

en:yogurts, yoghurts
fr:yaourts, yoghourts, yogourts
nl:yoghurts
en: yogurts, yoghurts
fr: yaourts, yoghourts, yogourts
nl: yoghurts
description:en: a yogurts of whatever type
description:fr: un yaourt de n'importe quel type
color:en: white
flavour:en: undef

<en:yogurts
en:banana yogurts
fr:yaourts à la banane
nl:bananenyoghurt
<en: yogurts
en: banana yogurts
fr: yaourts à la banane
nl: bananenyoghurt
description:en: a banana yogurt
description:fr: un yaourt à la banane
color:en: yellow
flavour:en: banana

<en:yogurts
en:Passion fruit yogurts
fr:yaourts au fruit de la passion
nl:yoghurts met passievrucht
<en: yogurts
en: Passion fruit yogurts
fr: yaourts au fruit de la passion
nl: yoghurts met passievrucht
color:en: undef
flavour:en: passion fruit

<en:yogurts
fr:yaourts allégés
<en: yogurts
fr: yaourts allégés

<fr:yoghourts
en:lemon yogurts
fr:yaourts au citron
nl:yoghurts met citroen
<fr: yoghourts
en: lemon yogurts
fr: yaourts au citron
nl: yoghurts met citroen
description:en: a yogurts with lemon inside
description:fr: un yaourt avec du citron
color:en: yellow
flavour:en: lemon

<fr:yaourts fruit de la passion
<fr:yaourts allégés
fr:yaourts au fruit de la passion allégés
nl:magere yoghurts met passievrucht
<fr: yaourts fruit de la passion
<fr: yaourts allégés
fr: yaourts au fruit de la passion allégés
nl: magere yoghurts met passievrucht


<fr:yaourts au citron
<fr:yaourts allégés
fr:yaourts au citron allégés
nl:magere citroenyoghurt
<fr: yaourts au citron
<fr: yaourts allégés
fr: yaourts au citron allégés
nl: magere citroenyoghurt
description:en: for light yogurts with lemon

<fr:yaourt
fr:yaourts à la myrtille
nl:bosbessenyoghurt
<fr: yaourt
fr: yaourts à la myrtille
nl: bosbessenyoghurt
flavour:en: blueberry
flavour:fr: myrtille

en:meat
vegan:en:no
carbon_footprint_fr_foodges_value:fr:10
en: meat
vegan:en: no
carbon_footprint_fr_foodges_value:fr: 10

<en:meat
en:beef
carbon_footprint_fr_foodges_value:fr:15
<en: meat
en: beef
carbon_footprint_fr_foodges_value:fr: 15

<en:beef
en:roast-beef
<en: beef
en: roast-beef

<en:meat
en:fake-meat
vegan:en:yes
<en: meat
en: fake-meat
vegan:en: yes
# undef will stop parents from transmitting a value
carbon_footprint_fr_foodges_value:fr:undef
carbon_footprint_fr_foodges_value:fr: undef

en:fake-stuff
en: fake-stuff

<en:fake-stuff
<en:fake-meat
en:fake-duck-meat
<en: fake-stuff
<en: fake-meat
en: fake-duck-meat

en:vegetable
vegan:en:yes
en: vegetable
vegan:en: yes

# the soup yogourt synonym is used to test suggestions matching xx: synonyms
en:soup
xx:something that means soup in every language, something else that means soup in every language
vegan:en:maybe
en: soup
xx: something that means soup in every language, something else that means soup in every language
vegan:en: maybe

<en:soup
en:vegan-soup
Expand Down
35 changes: 16 additions & 19 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,13 +122,13 @@ def _normalize_entry_id(self, raw_id: str) -> str:
normalized_id = f"{lc}:{normalized_main_tag}"
return normalized_id

def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
"""Get the language code "lc" and a list of normalized values"""
def _get_lc_value(self, line: str, remove_stopwords=True) -> tuple[str, list[str]]:
"""Get the language code "lc" and a list of values and normalized values"""
lc, line = line.split(":", 1)
new_line: list[str] = []
for word in line.split(","):
new_line.append(normalize_text(word, lc, stopwords=self.stopwords))
return lc, new_line
values = [word.strip() for word in line.split(",")]
stopwords = self.stopwords if remove_stopwords else []
tags = [normalize_text(word, lc, stopwords=stopwords) for word in values]
return lc, values, tags

def _set_data_id(self, data: NodeData, id: str, line_number: int) -> NodeData:
if not data.id:
Expand Down Expand Up @@ -253,12 +253,12 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
saved_nodes = []
index_stopwords = 0
index_synonyms = 0

# Check if it is correctly written
correctly_written = re.compile(r"\w+\Z")
# stopwords will contain a list of stopwords with their language code as key
self.stopwords = {}
# the other entries
# the first entry is after __header__ which was created before
data = NodeData(is_before="__header__")
line_number = (
entries_start_line # if the iterator is empty, line_number will not be unbound
Expand Down Expand Up @@ -305,43 +305,40 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
index_stopwords += 1
# remove "stopwords:" part
line = line[10:]
# compute raw values outside _get_lc_value as it normalizes them!
tags = [words.strip() for words in line[3:].split(",")]
try:
lc, value = self._get_lc_value(line)
lc, tags, tags_ids = self._get_lc_value(line, remove_stopwords=False)
except ValueError:
self.parser_logger.error(
f"Missing language code at line {line_number + 1} ? '{self.parser_logger.ellipsis(line)}'"
)
else:
data.tags["tags_" + lc] = tags
data.tags["tags_ids_" + lc] = value
# add the normalized list with its lc
self.stopwords[lc] = value
data.tags["tags_ids_" + lc] = tags_ids
# add the normalized list with its lc to current processing
self.stopwords[lc] = tags_ids
elif line.startswith("synonyms"):
# general synonyms definition for a language
id = "synonyms:" + str(index_synonyms)
data = self._set_data_id(data, id, line_number)
index_synonyms += 1
# remove "synonyms:" part
line = line[9:]
# compute raw values outside _get_lc_value as it normalizes them!
tags = [words.strip() for words in line[3:].split(",")]
try:
lc, value = self._get_lc_value(line)
lc, tags, tags_ids = self._get_lc_value(line)
except ValueError:
self.parser_logger.error(
f"Missing language code at line {line_number + 1} ? '{self.parser_logger.ellipsis(line)}'"
)
else:
data.tags["tags_" + lc] = tags
data.tags["tags_ids_" + lc] = value
data.tags["tags_ids_" + lc] = tags_ids
elif line[0] == "<":
# parent definition
data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1))
elif self.is_entry_synonyms_line(line):
# synonyms definition
if not data.id:
# the first item on the first line gives the id
data.id = self._normalize_entry_id(line.split(",", 1)[0])
# first 2-3 characters before ":" are the language code
data.main_language = data.id.split(":", 1)[0]
Expand Down Expand Up @@ -383,7 +380,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
)
if property_name:
prop_key = "prop_" + property_name + "_" + lc
data.properties[prop_key] = property_value
data.properties[prop_key] = property_value.strip()
data = self._get_node_data_with_comments_above_key(
data, line_number, prop_key
)
Expand Down
6 changes: 3 additions & 3 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def list_tags_lc(self, node):
def get_tags_line(self, node, lc):
"""return a string that should look like the original line"""
line = (", ").join(node["tags_" + lc])
return lc + ":" + line
return lc + ": " + line

@staticmethod
def property_sort_key(property):
Expand All @@ -75,15 +75,15 @@ def get_property_line(self, node, property):
"""return a string that should look like the original property line"""
property_name, lc = property.rsplit("_", 1)
property_value = node["prop_" + property]
line = property_name + ":" + lc + ":" + property_value
line = property_name + ":" + lc + ": " + property_value
return line

def get_parents_lines(self, parents):
for parent in parents:
parent = dict(parent)
lc = parent["main_language"]
parent_id = parent["tags_" + lc][0]
yield "<" + lc + ":" + parent_id
yield "<" + lc + ": " + parent_id

def iter_lines(self, project_label):
previous_block_id = ""
Expand Down
46 changes: 23 additions & 23 deletions parser/tests/data/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,37 @@

stopwords:fr: aux,au,de,le,du,la,a,et,test normalisation

synonyms:en:passion fruit, passionfruit
synonyms:en: passion fruit, passionfruit

synonyms:fr:fruit de la passion, maracuja, passion
synonyms:fr: fruit de la passion, maracuja, passion

<en:milk
en:yogurts, yoghurts
fr:yaourts, yoghourts, yogourts
<en: milk
en: yogurts, yoghurts
fr: yaourts, yoghourts, yogourts

<en:yogurts
en:banana yogurts
fr:yaourts à la banane
<en: yogurts
en: banana yogurts
fr: yaourts à la banane

<en:yogurts
en:Passion fruit yogurts
fr:yaourts au fruit de la passion
<en: yogurts
en: Passion fruit yogurts
fr: yaourts au fruit de la passion

<fr:yaourts fruit de la passion
fr:yaourts au fruit de la passion allégés
<fr: yaourts fruit de la passion
fr: yaourts au fruit de la passion allégés

# meat

en:meat
carbon_footprint_fr_foodges_value:fr:10
vegan:en:no
en: meat
carbon_footprint_fr_foodges_value:fr: 10
vegan:en: no

<en:meat
en:fake-meat
vegan:en:yes
<en: meat
en: fake-meat
vegan:en: yes

en:fake-stuff
en: fake-stuff

<en:fake-stuff
<en:fake-meat
en:fake-duck-meat
<en: fake-stuff
<en: fake-meat
en: fake-duck-meat
Loading

0 comments on commit 51f7b43

Please sign in to comment.