Commit

Merge branch 'main' into fix-more-formating
alexgarel committed Jul 23, 2024
2 parents b2c6eb1 + 0863d52 commit 1ff1274
Showing 11 changed files with 96 additions and 111 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/top-issues.yml
@@ -0,0 +1,27 @@
name: Top issues action.
#on:
# schedule:
# - cron: '0 0 */1 * *'
on:
issues:
types: [opened, transferred]

jobs:
ShowAndLabelTopIssues:
name: Display and label top issues.
runs-on: ubuntu-latest
steps:
- name: Run top issues action
uses: rickstaa/top-issues-action@v1
env:
github_token: ${{ secrets.GITHUB_TOKEN }}
with:
label: true
dashboard: true
dashboard_title: 👍 Top Issues Dashboard
dashboard_show_total_reactions: true
top_issues: true
top_bugs: true
top_features: true
top_pull_requests: true
top_list_size: 20
21 changes: 17 additions & 4 deletions Makefile
@@ -123,12 +123,18 @@ generate_sdk: ## Generate client SDK from OpenAPI spec
${DOCKER_COMPOSE} run --rm taxonomy_node npm run generate:api

# lint code
lint: backend_lint frontend_lint config_lint ## Run all linters
lint: parser_lint backend_lint frontend_lint config_lint ## Run all linters

backend_lint: ## Run lint on backend code
@echo "🍜 Linting python code"
${DOCKER_COMPOSE} run --rm taxonomy_api isort . /parser
${DOCKER_COMPOSE} run --rm taxonomy_api black . /parser
${DOCKER_COMPOSE} run --rm taxonomy_api isort .
${DOCKER_COMPOSE} run --rm taxonomy_api black .

parser_lint: ## Run lint on parser code
@echo "🍜 Linting python code"
${DOCKER_COMPOSE} run --rm -w /parser taxonomy_api isort /parser
${DOCKER_COMPOSE} run --rm -w /parser taxonomy_api black /parser


frontend_lint: ## Run lint on frontend code
@echo "🍜 Linting react code"
@@ -141,14 +147,21 @@ config_lint: ## Run on lint configuration files


# check code quality
quality: backend_quality frontend_quality config_quality ## Run all quality checks
quality: parser_quality backend_quality frontend_quality config_quality ## Run all quality checks

backend_quality: ## Run quality checks on backend code
@echo "🍜 Quality checks python"
${DOCKER_COMPOSE} run --rm taxonomy_api flake8 --exclude=.venv .
${DOCKER_COMPOSE} run --rm taxonomy_api isort --check-only --skip .venv .
${DOCKER_COMPOSE} run --rm taxonomy_api black --check --exclude=.venv .

parser_quality: ## Run quality checks on parser code
@echo "🍜 Quality checks python"
${DOCKER_COMPOSE} run --rm -w /parser taxonomy_api flake8 --exclude=.venv /parser
${DOCKER_COMPOSE} run --rm -w /parser taxonomy_api isort --check-only --skip .venv /parser
${DOCKER_COMPOSE} run --rm -w /parser taxonomy_api black --check --exclude=.venv /parser


frontend_quality: ## Run quality checks on frontend code
@echo "🍜 Quality checks JS"
${DOCKER_COMPOSE} run --rm taxonomy_node npx eslint --no-fix src/
4 changes: 2 additions & 2 deletions parser/openfoodfacts_taxonomy_parser/parser/__init__.py
@@ -1,2 +1,2 @@
from .parser import Parser
from .taxonomy_parser import TaxonomyParser
from .parser import Parser # noqa: F401
from .taxonomy_parser import TaxonomyParser # noqa: F401
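Note on the `# noqa: F401` markers added above: flake8 reports F401 ("imported but unused") for names that a package `__init__.py` imports only to re-export, which is exactly what these two lines do, so the comment keeps the lint run clean while preserving the public import path. A minimal sketch of the pattern; the `__all__`-based variant shown as comments is an alternative that usually satisfies pyflakes as well, not what this commit uses:

# package __init__.py re-exporting its public classes
from .parser import Parser  # noqa: F401  -- intentional re-export
from .taxonomy_parser import TaxonomyParser  # noqa: F401

# alternative: list the public names so the imports count as used
# from .parser import Parser
# from .taxonomy_parser import TaxonomyParser
# __all__ = ["Parser", "TaxonomyParser"]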
37 changes: 8 additions & 29 deletions parser/openfoodfacts_taxonomy_parser/parser/parser.py
@@ -9,14 +9,7 @@

from ..utils import get_project_name, normalize_text
from .logger import ParserConsoleLogger
from .taxonomy_parser import (
ChildLink,
NodeData,
NodeType,
PreviousLink,
Taxonomy,
TaxonomyParser,
)
from .taxonomy_parser import ChildLink, NodeData, NodeType, PreviousLink, Taxonomy, TaxonomyParser


class Parser:
@@ -26,9 +19,7 @@ def __init__(self, session: Session):
self.session = session
self.parser_logger = ParserConsoleLogger()

def _create_other_node(
self, tx: Transaction, node_data: NodeData, project_label: str
):
def _create_other_node(self, tx: Transaction, node_data: NodeData, project_label: str):
"""Create a TEXT, SYNONYMS or STOPWORDS node"""
if node_data.get_node_type() == NodeType.TEXT:
type_label = "TEXT"
@@ -97,26 +88,20 @@ def _create_entry_nodes(self, entry_nodes: list[NodeData], project_label: str):
original_taxonomy: entry_node.original_taxonomy
"""

properties_query = ",\n".join(
[base_properties_query, *additional_properties_queries]
)
properties_query = ",\n".join([base_properties_query, *additional_properties_queries])

query = f"""
WITH $entry_nodes as entry_nodes
UNWIND entry_nodes as entry_node
CREATE (n:{project_label}:ENTRY {{ {properties_query} }})
"""
self.session.run(
query, entry_nodes=[entry_node.to_dict() for entry_node in entry_nodes]
)
self.session.run(query, entry_nodes=[entry_node.to_dict() for entry_node in entry_nodes])

self.parser_logger.info(
f"Created {len(entry_nodes)} ENTRY nodes in {timeit.default_timer() - start_time} seconds"
)

def _create_previous_links(
self, previous_links: list[PreviousLink], project_label: str
):
def _create_previous_links(self, previous_links: list[PreviousLink], project_label: str):
"""Create the 'is_before' relations between nodes"""
self.parser_logger.info("Creating 'is_before' links")
start_time = timeit.default_timer()
@@ -225,9 +210,7 @@ def _create_node_fulltext_index(self, project_label: str):
)
self.session.run(query)

language_codes = [
lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""
]
language_codes = [lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""]
tags_prefixed_lc = ["n.tags_ids_" + lc for lc in language_codes]
tags_prefixed_lc = ", ".join(tags_prefixed_lc)
query = f"""CREATE FULLTEXT INDEX {project_label+'_SearchTagsIds'} IF NOT EXISTS
@@ -242,13 +225,9 @@ def _create_node_indexes(self, project_label: str):
self._create_node_id_index(project_label)
self._create_node_fulltext_index(project_label)

self.parser_logger.info(
f"Created indexes in {timeit.default_timer() - start_time} seconds"
)
self.parser_logger.info(f"Created indexes in {timeit.default_timer() - start_time} seconds")

def _write_to_database(
self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str
):
def _write_to_database(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str):
project_label = get_project_name(taxonomy_name, branch_name)
# First create nodes, then create node indexes to accelerate relationship creation, then create relationships
self._create_other_nodes(taxonomy.other_nodes, project_label)
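The reflowed `_create_entry_nodes` query above batches node creation with Cypher's `UNWIND`, so all entry nodes are sent in a single parameterised `session.run` call instead of one round trip per node. A minimal standalone sketch of that pattern with the neo4j Python driver; the connection URI, credentials, and the `ENTRY` label/properties used here are illustrative, not taken from the repository:

from neo4j import GraphDatabase

# illustrative connection details
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

entry_nodes = [
    {"id": "en:water", "main_language": "en"},
    {"id": "en:milk", "main_language": "en"},
]
query = """
WITH $entry_nodes as entry_nodes
UNWIND entry_nodes as entry_node
CREATE (n:ENTRY { id: entry_node.id, main_language: entry_node.main_language })
"""
with driver.session() as session:
    session.run(query, entry_nodes=entry_nodes)  # one query creates the whole batch
driver.close()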
61 changes: 18 additions & 43 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -220,9 +220,7 @@ def _get_node_data_with_comments_above_key(
# Get comments just above the given line
comments_above = []
current_line = line_number - 1
while (
new_data.comments_stack and new_data.comments_stack[-1][0] == current_line
):
while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line:
comments_above.append(new_data.comments_stack.pop()[1])
current_line -= 1
if comments_above:
@@ -260,9 +258,7 @@ def is_entry_synonyms_line(self, line):
return not (self._language_code_prefix.match(line[matching_prefix.end() :]))
return False

def _harvest_entries(
self, filename: str, entries_start_line: int
) -> Iterator[NodeData]:
def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
"""Transform data from file to dictionary"""
saved_nodes = []
index_stopwords = 0
@@ -274,7 +270,9 @@ def _harvest_entries(
self.stopwords = {}
# the first entry is after __header__ which was created before
data = NodeData(is_before="__header__")
line_number = entries_start_line # if the iterator is empty, line_number will not be unbound
line_number = (
entries_start_line # if the iterator is empty, line_number will not be unbound
)
for line_number, raw_line in self._file_iter(filename, entries_start_line):
# yield data if block ended
if self._entry_end(raw_line, data):
@@ -318,9 +316,7 @@ def _harvest_entries(
# remove "stopwords:" part
line = line[10:]
try:
lc, tags, tags_ids = self._get_lc_value(
line, remove_stopwords=False
)
lc, tags, tags_ids = self._get_lc_value(line, remove_stopwords=False)
except ValueError:
self.parser_logger.error(
f"Missing language code at line {line_number + 1} ? '{self.parser_logger.ellipsis(line)}'"
@@ -348,9 +344,7 @@ def _harvest_entries(
data.tags["tags_ids_" + lc] = tags_ids
elif line[0] == "<":
# parent definition
data.parent_tags.append(
(self._normalize_entry_id(line[1:]), line_number + 1)
)
data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1))
elif self.is_entry_synonyms_line(line):
# synonyms definition
if not data.id:
@@ -366,9 +360,7 @@ def _harvest_entries(
tagsids_list = []
for word in line.split(","):
tags_list.append(self.undo_normalize_text(word.strip()))
word_normalized = normalize_text(
word, lang, stopwords=self.stopwords
)
word_normalized = normalize_text(word, lang, stopwords=self.stopwords)
if word_normalized not in tagsids_list:
# in case 2 normalized synonyms are the same
tagsids_list.append(word_normalized)
@@ -391,8 +383,7 @@ def _harvest_entries(
property_name = property_name.strip()
lc = lc.strip().replace("-", "_")
if not (
correctly_written.match(property_name)
and correctly_written.match(lc)
correctly_written.match(property_name) and correctly_written.match(lc)
):
self.parser_logger.error(
f"Reading error at line {line_number + 1}, unexpected format: '{self.parser_logger.ellipsis(line)}'"
@@ -427,9 +418,7 @@ def _normalise_and_validate_child_links(
# we collect all the tags_ids in a certain language
tags_ids = {}
for node in entry_nodes:
node_tags_ids = {
tag_id: node.id for tag_id in node.tags.get(f"tags_ids_{lc}", [])
}
node_tags_ids = {tag_id: node.id for tag_id in node.tags.get(f"tags_ids_{lc}", [])}
tags_ids.update(node_tags_ids)

# we check if the parent_id exists in the tags_ids
@@ -438,9 +427,7 @@ def _normalise_and_validate_child_links(
if parent_id not in tags_ids:
missing_child_links.append(child_link)
else:
child_link["parent_id"] = tags_ids[
parent_id
] # normalise the parent_id
child_link["parent_id"] = tags_ids[parent_id] # normalise the parent_id
normalised_child_links.append(child_link)

return normalised_child_links, missing_child_links
@@ -466,10 +453,8 @@ def _get_valid_child_links(
]

# Normalise and validate the unnormalised links
normalised_child_links, missing_child_links = (
self._normalise_and_validate_child_links(
entry_nodes, child_links_to_normalise
)
normalised_child_links, missing_child_links = self._normalise_and_validate_child_links(
entry_nodes, child_links_to_normalise
)

valid_child_links.extend(normalised_child_links)
@@ -483,9 +468,7 @@ def _get_valid_child_links(

return valid_child_links

def _remove_duplicate_child_links(
self, child_links: list[ChildLink]
) -> list[ChildLink]:
def _remove_duplicate_child_links(self, child_links: list[ChildLink]) -> list[ChildLink]:
"""Remove duplicate child links (i.e child links with the same parent_id and id)"""
unique_child_links = []
children_to_parents = collections.defaultdict(set)
@@ -496,9 +479,7 @@ def _remove_duplicate_child_links(
unique_child_links.append(child_link)
return unique_child_links

def _merge_duplicate_entry_nodes(
self, entry_nodes: list[NodeData]
) -> list[NodeData]:
def _merge_duplicate_entry_nodes(self, entry_nodes: list[NodeData]) -> list[NodeData]:
"""Merge entry nodes with the same id:
- merge their tags (union)
- merge their properties (union, and in case of conflict, keep the last value)
@@ -568,9 +549,7 @@ def _create_taxonomy(
entry_nodes: list[NodeData] = []
entry_nodes.extend(external_entry_nodes)
other_nodes = [
NodeData(
id="__header__", preceding_lines=harvested_header_data, src_position=1
)
NodeData(id="__header__", preceding_lines=harvested_header_data, src_position=1)
]
previous_links: list[PreviousLink] = []
raw_child_links: list[ChildLink] = []
@@ -582,9 +561,7 @@ def _create_taxonomy(
else:
other_nodes.append(entry)
if entry.is_before:
previous_links.append(
PreviousLink(before_id=entry.is_before, id=entry.id)
)
previous_links.append(PreviousLink(before_id=entry.is_before, id=entry.id))
if entry.parent_tags:
for position, (parent, line_position) in enumerate(entry.parent_tags):
raw_child_links.append(
@@ -617,9 +594,7 @@ def parse_file(
start_time = timeit.default_timer()
filename = normalize_filename(filename)
taxonomy = self._create_taxonomy(filename, external_filenames)
self.parser_logger.info(
f"Parsing done in {timeit.default_timer() - start_time} seconds."
)
self.parser_logger.info(f"Parsing done in {timeit.default_timer() - start_time} seconds.")
self.parser_logger.info(
f"Found {len(taxonomy.entry_nodes) + len(taxonomy.other_nodes)} nodes"
)
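The `_merge_duplicate_entry_nodes` docstring above describes a union merge keyed on the node id, with the last value winning when properties conflict. A minimal sketch of that merge rule on plain dicts; the field layout and sample data are simplified for illustration and this is not the repository's implementation:

def merge_duplicate_entries(entries: list[dict]) -> list[dict]:
    """Merge entries sharing an id: union of tags, union of properties, last value wins."""
    merged: dict[str, dict] = {}  # insertion-ordered, so the original node order is kept
    for entry in entries:
        node = merged.setdefault(entry["id"], {"id": entry["id"], "tags": {}, "properties": {}})
        for lc, tags in entry.get("tags", {}).items():
            existing = node["tags"].setdefault(lc, [])
            existing += [tag for tag in tags if tag not in existing]  # union, order preserved
        node["properties"].update(entry.get("properties", {}))  # conflicts keep the last value
    return list(merged.values())

merged = merge_duplicate_entries([
    {"id": "en:milk", "tags": {"en": ["milk"]}, "properties": {"vegan:en": "no"}},
    {"id": "en:milk", "tags": {"en": ["milk", "cow milk"]}, "properties": {"vegan:en": "maybe"}},
])
assert merged[0]["tags"]["en"] == ["milk", "cow milk"]
assert merged[0]["properties"]["vegan:en"] == "maybe"  # last value won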
7 changes: 4 additions & 3 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
@@ -17,7 +17,8 @@ def get_all_nodes(self, project_label):
this function use the relationships between nodes"""
# This query first lists all the nodes in the "is_before" order
# then for each node in the path, it finds its parents
# and finally it returns the node and its parents (the parents are ordered in the same order as in the original file)
# and finally it returns the node and its parents
# (the parents are ordered in the same order as in the original file)
# Note: OPTIONAL MATCH is used to return nodes without parents
query = f"""
MATCH path = ShortestPath(
@@ -95,9 +96,9 @@ def iter_lines(self, project_label):
node = dict(node)
has_content = node["id"] not in ["__header__", "__footer__"]
# eventually add a blank line but in specific case
following_synonyms = node["id"].startswith(
following_synonyms = node["id"].startswith("synonyms") and previous_block_id.startswith(
"synonyms"
) and previous_block_id.startswith("synonyms")
)
following_stopwords = node["id"].startswith(
"stopwords"
) and previous_block_id.startswith("stopwords")
4 changes: 1 addition & 3 deletions parser/openfoodfacts_taxonomy_parser/utils.py
@@ -48,9 +48,7 @@ def normalize_text(
stopwords = stopwords[lang]
line_surrounded_by_char = char + line + char
for stopword in stopwords:
line_surrounded_by_char = line_surrounded_by_char.replace(
char + stopword + char, char
)
line_surrounded_by_char = line_surrounded_by_char.replace(char + stopword + char, char)
line = line_surrounded_by_char[1:-1]

return line
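For context, the stopword handling reflowed above works by padding the normalized line with its separator character and collapsing `separator + stopword + separator` back to a single separator, so only whole tokens are removed. A self-contained sketch of the trick; the function name, default separator, and sample data are illustrative:

def strip_stopwords(line: str, stopwords: list[str], sep: str = "-") -> str:
    """Remove whole-token stopwords from a separator-joined, normalized line."""
    padded = sep + line + sep          # "en-the-cheese" -> "-en-the-cheese-"
    for stopword in stopwords:
        padded = padded.replace(sep + stopword + sep, sep)  # "-the-" collapses to "-"
    return padded[1:-1]

assert strip_stopwords("en-the-cheese", ["the"]) == "en-cheese"
assert strip_stopwords("theory-of-mind", ["the"]) == "theory-of-mind"  # "the" inside a token is kept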
9 changes: 3 additions & 6 deletions parser/tests/integration/test_parse_unparse_integration.py
@@ -1,16 +1,13 @@
import pathlib

import pytest

from openfoodfacts_taxonomy_parser import parser, unparser

# taxonomy in text format : test.txt
TEST_TAXONOMY_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test.txt")
TEST_EXTERNAL_1_TXT = str(
pathlib.Path(__file__).parent.parent / "data" / "test_external1.txt"
)
TEST_EXTERNAL_2_TXT = str(
pathlib.Path(__file__).parent.parent / "data" / "test_external2.txt"
)
TEST_EXTERNAL_1_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test_external1.txt")
TEST_EXTERNAL_2_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test_external2.txt")


@pytest.fixture(autouse=True)
