Commit

Merge branch 'main' into fix-more-formating
alexgarel committed Jul 23, 2024
2 parents b2c6eb1 + 0863d52 commit 1ff1274
Showing 11 changed files with 96 additions and 111 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/top-issues.yml
@@ -0,0 +1,27 @@
name: Top issues action.
#on:
# schedule:
# - cron: '0 0 */1 * *'
on:
issues:
types: [opened, transferred]

jobs:
ShowAndLabelTopIssues:
name: Display and label top issues.
runs-on: ubuntu-latest
steps:
- name: Run top issues action
uses: rickstaa/top-issues-action@v1
env:
github_token: ${{ secrets.GITHUB_TOKEN }}
with:
label: true
dashboard: true
dashboard_title: 👍 Top Issues Dashboard
dashboard_show_total_reactions: true
top_issues: true
top_bugs: true
top_features: true
top_pull_requests: true
top_list_size: 20
21 changes: 17 additions & 4 deletions Makefile
@@ -123,12 +123,18 @@ generate_sdk: ## Generate client SDK from OpenAPI spec
${DOCKER_COMPOSE} run --rm taxonomy_node npm run generate:api

# lint code
lint: backend_lint frontend_lint config_lint ## Run all linters
lint: parser_lint backend_lint frontend_lint config_lint ## Run all linters

backend_lint: ## Run lint on backend code
@echo "🍜 Linting python code"
${DOCKER_COMPOSE} run --rm taxonomy_api isort . /parser
${DOCKER_COMPOSE} run --rm taxonomy_api black . /parser
${DOCKER_COMPOSE} run --rm taxonomy_api isort .
${DOCKER_COMPOSE} run --rm taxonomy_api black .

parser_lint: ## Run lint on parser code
@echo "🍜 Linting python code"
${DOCKER_COMPOSE} run --rm -w /parser taxonomy_api isort /parser
${DOCKER_COMPOSE} run --rm -w /parser taxonomy_api black /parser


frontend_lint: ## Run lint on frontend code
@echo "🍜 Linting react code"
@@ -141,14 +147,21 @@ config_lint: ## Run on lint configuration files


# check code quality
quality: backend_quality frontend_quality config_quality ## Run all quality checks
quality: parser_quality backend_quality frontend_quality config_quality ## Run all quality checks

backend_quality: ## Run quality checks on backend code
@echo "🍜 Quality checks python"
${DOCKER_COMPOSE} run --rm taxonomy_api flake8 --exclude=.venv .
${DOCKER_COMPOSE} run --rm taxonomy_api isort --check-only --skip .venv .
${DOCKER_COMPOSE} run --rm taxonomy_api black --check --exclude=.venv .

parser_quality: ## Run quality checks on parser code
@echo "🍜 Quality checks python"
${DOCKER_COMPOSE} run --rm -w /parser taxonomy_api flake8 --exclude=.venv /parser
${DOCKER_COMPOSE} run --rm -w /parser taxonomy_api isort --check-only --skip .venv /parser
${DOCKER_COMPOSE} run --rm -w /parser taxonomy_api black --check --exclude=.venv /parser


frontend_quality: ## Run quality checks on frontend code
@echo "🍜 Quality checks JS"
${DOCKER_COMPOSE} run --rm taxonomy_node npx eslint --no-fix src/
4 changes: 2 additions & 2 deletions parser/openfoodfacts_taxonomy_parser/parser/__init__.py
@@ -1,2 +1,2 @@
from .parser import Parser
from .taxonomy_parser import TaxonomyParser
from .parser import Parser # noqa: F401
from .taxonomy_parser import TaxonomyParser # noqa: F401
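Note on the `# noqa: F401` markers added above: flake8 reports F401 ("imported but unused") for names that a package `__init__.py` imports only to re-export, which is exactly what these two lines do, so the comment keeps the lint run clean while preserving the public import path. A minimal sketch of the pattern; the `__all__`-based variant shown as comments is an alternative that usually satisfies pyflakes as well, not what this commit uses:

# package __init__.py re-exporting its public classes
from .parser import Parser  # noqa: F401  -- intentional re-export
from .taxonomy_parser import TaxonomyParser  # noqa: F401

# alternative: list the public names so the imports count as used
# from .parser import Parser
# from .taxonomy_parser import TaxonomyParser
# __all__ = ["Parser", "TaxonomyParser"]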
37 changes: 8 additions & 29 deletions parser/openfoodfacts_taxonomy_parser/parser/parser.py
@@ -9,14 +9,7 @@

from ..utils import get_project_name, normalize_text
from .logger import ParserConsoleLogger
from .taxonomy_parser import (
ChildLink,
NodeData,
NodeType,
PreviousLink,
Taxonomy,
TaxonomyParser,
)
from .taxonomy_parser import ChildLink, NodeData, NodeType, PreviousLink, Taxonomy, TaxonomyParser


class Parser:
@@ -26,9 +19,7 @@ def __init__(self, session: Session):
self.session = session
self.parser_logger = ParserConsoleLogger()

def _create_other_node(
self, tx: Transaction, node_data: NodeData, project_label: str
):
def _create_other_node(self, tx: Transaction, node_data: NodeData, project_label: str):
"""Create a TEXT, SYNONYMS or STOPWORDS node"""
if node_data.get_node_type() == NodeType.TEXT:
type_label = "TEXT"
@@ -97,26 +88,20 @@ def _create_entry_nodes(self, entry_nodes: list[NodeData], project_label: str):
original_taxonomy: entry_node.original_taxonomy
"""

properties_query = ",\n".join(
[base_properties_query, *additional_properties_queries]
)
properties_query = ",\n".join([base_properties_query, *additional_properties_queries])

query = f"""
WITH $entry_nodes as entry_nodes
UNWIND entry_nodes as entry_node
CREATE (n:{project_label}:ENTRY {{ {properties_query} }})
"""
self.session.run(
query, entry_nodes=[entry_node.to_dict() for entry_node in entry_nodes]
)
self.session.run(query, entry_nodes=[entry_node.to_dict() for entry_node in entry_nodes])

self.parser_logger.info(
f"Created {len(entry_nodes)} ENTRY nodes in {timeit.default_timer() - start_time} seconds"
)

def _create_previous_links(
self, previous_links: list[PreviousLink], project_label: str
):
def _create_previous_links(self, previous_links: list[PreviousLink], project_label: str):
"""Create the 'is_before' relations between nodes"""
self.parser_logger.info("Creating 'is_before' links")
start_time = timeit.default_timer()
@@ -225,9 +210,7 @@ def _create_node_fulltext_index(self, project_label: str):
)
self.session.run(query)

language_codes = [
lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""
]
language_codes = [lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""]
tags_prefixed_lc = ["n.tags_ids_" + lc for lc in language_codes]
tags_prefixed_lc = ", ".join(tags_prefixed_lc)
query = f"""CREATE FULLTEXT INDEX {project_label+'_SearchTagsIds'} IF NOT EXISTS
@@ -242,13 +225,9 @@ def _create_node_indexes(self, project_label: str):
self._create_node_id_index(project_label)
self._create_node_fulltext_index(project_label)

self.parser_logger.info(
f"Created indexes in {timeit.default_timer() - start_time} seconds"
)
self.parser_logger.info(f"Created indexes in {timeit.default_timer() - start_time} seconds")

def _write_to_database(
self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str
):
def _write_to_database(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str):
project_label = get_project_name(taxonomy_name, branch_name)
# First create nodes, then create node indexes to accelerate relationship creation, then create relationships
self._create_other_nodes(taxonomy.other_nodes, project_label)
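The reflowed `_create_entry_nodes` query above batches node creation with Cypher's `UNWIND`, so all entry nodes are sent in a single parameterised `session.run` call instead of one round trip per node. A minimal standalone sketch of that pattern with the neo4j Python driver; the connection URI, credentials, and the `ENTRY` label/properties used here are illustrative, not taken from the repository:

from neo4j import GraphDatabase

# illustrative connection details
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

entry_nodes = [
    {"id": "en:water", "main_language": "en"},
    {"id": "en:milk", "main_language": "en"},
]
query = """
WITH $entry_nodes as entry_nodes
UNWIND entry_nodes as entry_node
CREATE (n:ENTRY { id: entry_node.id, main_language: entry_node.main_language })
"""
with driver.session() as session:
    session.run(query, entry_nodes=entry_nodes)  # one query creates the whole batch
driver.close()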
61 changes: 18 additions & 43 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -220,9 +220,7 @@ def _get_node_data_with_comments_above_key(
# Get comments just above the given line
comments_above = []
current_line = line_number - 1
while (
new_data.comments_stack and new_data.comments_stack[-1][0] == current_line
):
while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line:
comments_above.append(new_data.comments_stack.pop()[1])
current_line -= 1
if comments_above:
@@ -260,9 +258,7 @@ def is_entry_synonyms_line(self, line):
return not (self._language_code_prefix.match(line[matching_prefix.end() :]))
return False

def _harvest_entries(
self, filename: str, entries_start_line: int
) -> Iterator[NodeData]:
def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
"""Transform data from file to dictionary"""
saved_nodes = []
index_stopwords = 0
@@ -274,7 +270,9 @@ def _harvest_entries(
self.stopwords = {}
# the first entry is after __header__ which was created before
data = NodeData(is_before="__header__")
line_number = entries_start_line # if the iterator is empty, line_number will not be unbound
line_number = (
entries_start_line # if the iterator is empty, line_number will not be unbound
)
for line_number, raw_line in self._file_iter(filename, entries_start_line):
# yield data if block ended
if self._entry_end(raw_line, data):
@@ -318,9 +316,7 @@ def _harvest_entries(
# remove "stopwords:" part
line = line[10:]
try:
lc, tags, tags_ids = self._get_lc_value(
line, remove_stopwords=False
)
lc, tags, tags_ids = self._get_lc_value(line, remove_stopwords=False)
except ValueError:
self.parser_logger.error(
f"Missing language code at line {line_number + 1} ? '{self.parser_logger.ellipsis(line)}'"
@@ -348,9 +344,7 @@ def _harvest_entries(
data.tags["tags_ids_" + lc] = tags_ids
elif line[0] == "<":
# parent definition
data.parent_tags.append(
(self._normalize_entry_id(line[1:]), line_number + 1)
)
data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1))
elif self.is_entry_synonyms_line(line):
# synonyms definition
if not data.id:
@@ -366,9 +360,7 @@ def _harvest_entries(
tagsids_list = []
for word in line.split(","):
tags_list.append(self.undo_normalize_text(word.strip()))
word_normalized = normalize_text(
word, lang, stopwords=self.stopwords
)
word_normalized = normalize_text(word, lang, stopwords=self.stopwords)
if word_normalized not in tagsids_list:
# in case 2 normalized synonyms are the same
tagsids_list.append(word_normalized)
@@ -391,8 +383,7 @@ def _harvest_entries(
property_name = property_name.strip()
lc = lc.strip().replace("-", "_")
if not (
correctly_written.match(property_name)
and correctly_written.match(lc)
correctly_written.match(property_name) and correctly_written.match(lc)
):
self.parser_logger.error(
f"Reading error at line {line_number + 1}, unexpected format: '{self.parser_logger.ellipsis(line)}'"
@@ -427,9 +418,7 @@ def _normalise_and_validate_child_links(
# we collect all the tags_ids in a certain language
tags_ids = {}
for node in entry_nodes:
node_tags_ids = {
tag_id: node.id for tag_id in node.tags.get(f"tags_ids_{lc}", [])
}
node_tags_ids = {tag_id: node.id for tag_id in node.tags.get(f"tags_ids_{lc}", [])}
tags_ids.update(node_tags_ids)

# we check if the parent_id exists in the tags_ids
@@ -438,9 +427,7 @@ def _normalise_and_validate_child_links(
if parent_id not in tags_ids:
missing_child_links.append(child_link)
else:
child_link["parent_id"] = tags_ids[
parent_id
] # normalise the parent_id
child_link["parent_id"] = tags_ids[parent_id] # normalise the parent_id
normalised_child_links.append(child_link)

return normalised_child_links, missing_child_links
@@ -466,10 +453,8 @@ def _get_valid_child_links(
]

# Normalise and validate the unnormalised links
normalised_child_links, missing_child_links = (
self._normalise_and_validate_child_links(
entry_nodes, child_links_to_normalise
)
normalised_child_links, missing_child_links = self._normalise_and_validate_child_links(
entry_nodes, child_links_to_normalise
)

valid_child_links.extend(normalised_child_links)
@@ -483,9 +468,7 @@ def _get_valid_child_links(

return valid_child_links

def _remove_duplicate_child_links(
self, child_links: list[ChildLink]
) -> list[ChildLink]:
def _remove_duplicate_child_links(self, child_links: list[ChildLink]) -> list[ChildLink]:
"""Remove duplicate child links (i.e child links with the same parent_id and id)"""
unique_child_links = []
children_to_parents = collections.defaultdict(set)
@@ -496,9 +479,7 @@ def _remove_duplicate_child_links(
unique_child_links.append(child_link)
return unique_child_links

def _merge_duplicate_entry_nodes(
self, entry_nodes: list[NodeData]
) -> list[NodeData]:
def _merge_duplicate_entry_nodes(self, entry_nodes: list[NodeData]) -> list[NodeData]:
"""Merge entry nodes with the same id:
- merge their tags (union)
- merge their properties (union, and in case of conflict, keep the last value)
@@ -568,9 +549,7 @@ def _create_taxonomy(
entry_nodes: list[NodeData] = []
entry_nodes.extend(external_entry_nodes)
other_nodes = [
NodeData(
id="__header__", preceding_lines=harvested_header_data, src_position=1
)
NodeData(id="__header__", preceding_lines=harvested_header_data, src_position=1)
]
previous_links: list[PreviousLink] = []
raw_child_links: list[ChildLink] = []
@@ -582,9 +561,7 @@ def _create_taxonomy(
else:
other_nodes.append(entry)
if entry.is_before:
previous_links.append(
PreviousLink(before_id=entry.is_before, id=entry.id)
)
previous_links.append(PreviousLink(before_id=entry.is_before, id=entry.id))
if entry.parent_tags:
for position, (parent, line_position) in enumerate(entry.parent_tags):
raw_child_links.append(
@@ -617,9 +594,7 @@ def parse_file(
start_time = timeit.default_timer()
filename = normalize_filename(filename)
taxonomy = self._create_taxonomy(filename, external_filenames)
self.parser_logger.info(
f"Parsing done in {timeit.default_timer() - start_time} seconds."
)
self.parser_logger.info(f"Parsing done in {timeit.default_timer() - start_time} seconds.")
self.parser_logger.info(
f"Found {len(taxonomy.entry_nodes) + len(taxonomy.other_nodes)} nodes"
)
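The `_merge_duplicate_entry_nodes` docstring above describes a union merge keyed on the node id, with the last value winning when properties conflict. A minimal sketch of that merge rule on plain dicts; the field layout and sample data are simplified for illustration and this is not the repository's implementation:

def merge_duplicate_entries(entries: list[dict]) -> list[dict]:
    """Merge entries sharing an id: union of tags, union of properties, last value wins."""
    merged: dict[str, dict] = {}  # insertion-ordered, so the original node order is kept
    for entry in entries:
        node = merged.setdefault(entry["id"], {"id": entry["id"], "tags": {}, "properties": {}})
        for lc, tags in entry.get("tags", {}).items():
            existing = node["tags"].setdefault(lc, [])
            existing += [tag for tag in tags if tag not in existing]  # union, order preserved
        node["properties"].update(entry.get("properties", {}))  # conflicts keep the last value
    return list(merged.values())

merged = merge_duplicate_entries([
    {"id": "en:milk", "tags": {"en": ["milk"]}, "properties": {"vegan:en": "no"}},
    {"id": "en:milk", "tags": {"en": ["milk", "cow milk"]}, "properties": {"vegan:en": "maybe"}},
])
assert merged[0]["tags"]["en"] == ["milk", "cow milk"]
assert merged[0]["properties"]["vegan:en"] == "maybe"  # last value won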
7 changes: 4 additions & 3 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
@@ -17,7 +17,8 @@ def get_all_nodes(self, project_label):
this function use the relationships between nodes"""
# This query first lists all the nodes in the "is_before" order
# then for each node in the path, it finds its parents
# and finally it returns the node and its parents (the parents are ordered in the same order as in the original file)
# and finally it returns the node and its parents
# (the parents are ordered in the same order as in the original file)
# Note: OPTIONAL MATCH is used to return nodes without parents
query = f"""
MATCH path = ShortestPath(
@@ -95,9 +96,9 @@ def iter_lines(self, project_label):
node = dict(node)
has_content = node["id"] not in ["__header__", "__footer__"]
# eventually add a blank line but in specific case
following_synonyms = node["id"].startswith(
following_synonyms = node["id"].startswith("synonyms") and previous_block_id.startswith(
"synonyms"
) and previous_block_id.startswith("synonyms")
)
following_stopwords = node["id"].startswith(
"stopwords"
) and previous_block_id.startswith("stopwords")
4 changes: 1 addition & 3 deletions parser/openfoodfacts_taxonomy_parser/utils.py
@@ -48,9 +48,7 @@ def normalize_text(
stopwords = stopwords[lang]
line_surrounded_by_char = char + line + char
for stopword in stopwords:
line_surrounded_by_char = line_surrounded_by_char.replace(
char + stopword + char, char
)
line_surrounded_by_char = line_surrounded_by_char.replace(char + stopword + char, char)
line = line_surrounded_by_char[1:-1]

return line
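For context, the stopword handling reflowed above works by padding the normalized line with its separator character and collapsing `separator + stopword + separator` back to a single separator, so only whole tokens are removed. A self-contained sketch of the trick; the function name, default separator, and sample data are illustrative:

def strip_stopwords(line: str, stopwords: list[str], sep: str = "-") -> str:
    """Remove whole-token stopwords from a separator-joined, normalized line."""
    padded = sep + line + sep          # "en-the-cheese" -> "-en-the-cheese-"
    for stopword in stopwords:
        padded = padded.replace(sep + stopword + sep, sep)  # "-the-" collapses to "-"
    return padded[1:-1]

assert strip_stopwords("en-the-cheese", ["the"]) == "en-cheese"
assert strip_stopwords("theory-of-mind", ["the"]) == "theory-of-mind"  # "the" inside a token is kept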
9 changes: 3 additions & 6 deletions parser/tests/integration/test_parse_unparse_integration.py
@@ -1,16 +1,13 @@
import pathlib

import pytest

from openfoodfacts_taxonomy_parser import parser, unparser

# taxonomy in text format : test.txt
TEST_TAXONOMY_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test.txt")
TEST_EXTERNAL_1_TXT = str(
pathlib.Path(__file__).parent.parent / "data" / "test_external1.txt"
)
TEST_EXTERNAL_2_TXT = str(
pathlib.Path(__file__).parent.parent / "data" / "test_external2.txt"
)
TEST_EXTERNAL_1_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test_external1.txt")
TEST_EXTERNAL_2_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test_external2.txt")


@pytest.fixture(autouse=True)
