From a40b9fd69b046823e871c577ac2f696b46058382 Mon Sep 17 00:00:00 2001 From: Aadarsh A Date: Mon, 7 Nov 2022 19:01:47 +0530 Subject: [PATCH] fix: Add multiple labels in backend, import and export (#101) - Add multiple labels (to support branch and taxonomy labelling) in parser - Add multiple labels in unparser and tests - Changed API endpoints accordingly to support different branches and taxonomies - Wrapped entries.py in a class for easier use - Added import functionality - Added export functionality fixes: - #69 relates to: - #71 - #83 --- .env | 3 + backend/Dockerfile | 11 +- backend/editor/api.py | 233 +++-- backend/editor/entries.py | 797 +++++++++++------- backend/editor/exceptions.py | 54 +- backend/editor/github_functions.py | 67 ++ backend/editor/graph_db.py | 45 +- backend/editor/settings.py | 8 +- backend/requirements.txt | 3 +- docker-compose.yml | 5 +- docker/dev.yml | 11 +- .../normalizer.py | 29 +- .../openfoodfacts_taxonomy_parser/parser.py | 173 ++-- .../openfoodfacts_taxonomy_parser/unparser.py | 53 +- parser/setup.py | 25 + parser/tests/conftest.py | 3 +- .../test_parse_unparse_integration.py | 90 +- .../integration/test_parser_integration.py | 43 +- parser/tests/unit/test_parser_unit.py | 23 +- 19 files changed, 1144 insertions(+), 532 deletions(-) create mode 100644 backend/editor/github_functions.py rename {backend/editor => parser/openfoodfacts_taxonomy_parser}/normalizer.py (54%) create mode 100644 parser/setup.py diff --git a/.env b/.env index 6f337c6f..b9aa526e 100644 --- a/.env +++ b/.env @@ -1,6 +1,9 @@ # use windows path separator for compat COMPOSE_PATH_SEPARATOR=; COMPOSE_FILE=docker-compose.yml;docker/dev.yml + +DOCKER_TAG=dev + # domain name TAXONOMY_EDITOR_DOMAIN=taxonomy.localhost # exposition diff --git a/backend/Dockerfile b/backend/Dockerfile index e1632fc2..f13cf3c0 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -14,12 +14,17 @@ RUN groupadd -g $USER_GID off && \ mkdir -p /home/off && \ chown off:off -R /code /home/off -COPY ./requirements.txt /code/requirements.txt +COPY backend/requirements.txt /code/requirements.txt +COPY backend/editor /code/editor +COPY parser /parser + RUN pip3 install --no-cache-dir --upgrade -r /code/requirements.txt +# this is purely cosmetic +RUN ln -s /parser/openfoodfacts_taxonomy_parser /code/openfoodfacts_taxonomy_parser RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \ pip3 install --upgrade -r /code/requirements.txt USER off:off -COPY --chown=off:off ./editor /code/editor +COPY --chown=off:off ./backend/editor /code/editor -CMD ["uvicorn", "editor.api:app", "--host", "0.0.0.0", "--port", "80"] \ No newline at end of file +CMD ["uvicorn", "editor.api:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/backend/editor/api.py b/backend/editor/api.py index 5ff59c6d..8fd4f720 100644 --- a/backend/editor/api.py +++ b/backend/editor/api.py @@ -2,22 +2,25 @@ Taxonomy Editor Backend API """ # Required imports -#------------------------------------------------------------------------# +#----------------------------------------------------------------------------# from datetime import datetime +import os # FastAPI -from fastapi import FastAPI, status, Response, Request, HTTPException +from fastapi import FastAPI, status, Response, Request, HTTPException, BackgroundTasks +from fastapi.responses import FileResponse from fastapi.middleware.cors import CORSMiddleware +# Custom exceptions +from .exceptions import GithubBranchExistsError, GithubUploadError + # Data model imports from .models import Header, 
Footer

# DB helper imports
from . import graph_db
-from .entries import get_all_nodes, get_nodes, get_children, get_parents, get_label, full_text_search
-from .entries import update_nodes, update_node_children
-from .entries import create_node, add_node_to_end, add_node_to_beginning, delete_node
-#------------------------------------------------------------------------#
+from .entries import TaxonomyGraph
+#----------------------------------------------------------------------------#

app = FastAPI(title="Open Food Facts Taxonomy Editor API")
@@ -68,6 +71,15 @@ def check_single(id):
        raise HTTPException(status_code=404, detail="Entry not found")
    elif len(id) > 1:
        raise HTTPException(status_code=500, detail="Multiple entries found")
+
+def file_cleanup(filepath):
+    """
+    Helper function to delete a taxonomy file from local storage
+    """
+    try:
+        os.remove(filepath)
+    except:
+        raise HTTPException(status_code=500, detail="Taxonomy file not found for deletion")

# Get methods

@@ -83,128 +95,204 @@ async def pong(response: Response):
    pong = datetime.now()
    return {"ping": "pong @ %s" % pong}

-@app.get("/nodes")
-async def findAllNodes(response: Response):
+@app.get("/projects")
+async def listAllProjects(response: Response):
+    """
+    List all open projects created in the Taxonomy Editor
+    """
+    # Listing all projects doesn't require a taxonomy name or branch name
+    taxonomy = TaxonomyGraph("", "")
+    result = list(taxonomy.list_existing_projects())
+    return result
+
+@app.get("/{taxonomy_name}/{branch}/nodes")
+async def findAllNodes(response: Response, branch: str, taxonomy_name: str):
    """
    Get all nodes within taxonomy
    """
-    result = get_all_nodes("")
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
+    result = taxonomy.get_all_nodes("")
    allNodes = list(result)
    return allNodes

-@app.get("/entry/{entry}")
-async def findOneEntry(response: Response, entry: str):
+@app.get("/{taxonomy_name}/{branch}/rootnodes")
+async def findAllRootNodes(response: Response, branch: str, taxonomy_name: str):
+    """
+    Get all root nodes within taxonomy
+    """
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
+    result = taxonomy.get_all_root_nodes()
+    allRootNodes = list(result)
+    return allRootNodes
+
+@app.get("/{taxonomy_name}/{branch}/entry/{entry}")
+async def findOneEntry(response: Response, branch: str, taxonomy_name: str, entry: str):
    """
    Get entry corresponding to id within taxonomy
    """
-    result = get_nodes("ENTRY", entry)
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
+    result = taxonomy.get_nodes("ENTRY", entry)
    oneEntry = list(result)
    check_single(oneEntry)
    return oneEntry[0]

-@app.get("/entry/{entry}/parents")
-async def findOneEntryParents(response: Response, entry: str):
+@app.get("/{taxonomy_name}/{branch}/entry/{entry}/parents")
+async def findOneEntryParents(response: Response, branch: str, taxonomy_name: str, entry: str):
    """
    Get parents for an entry corresponding to id within taxonomy
    """
-    result = get_parents(entry)
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
+    result = taxonomy.get_parents(entry)
    oneEntryParents = list(result)
    return oneEntryParents

-@app.get("/entry/{entry}/children")
-async def findOneEntryChildren(response: Response, entry: str):
+@app.get("/{taxonomy_name}/{branch}/entry/{entry}/children")
+async def findOneEntryChildren(response: Response, branch: str, taxonomy_name: str, entry: str):
    """
    Get children for an entry corresponding to id within taxonomy
    """
-    result = get_children(entry)
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
+    result = 
taxonomy.get_children(entry)
    oneEntryChildren = list(result)
    return oneEntryChildren

-@app.get("/entry")
-async def findAllEntries(response: Response):
+@app.get("/{taxonomy_name}/{branch}/entry")
+async def findAllEntries(response: Response, branch: str, taxonomy_name: str):
    """
    Get all entries within taxonomy
    """
-    result = get_all_nodes("ENTRY")
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
+    result = taxonomy.get_all_nodes("ENTRY")
    allEntries = list(result)
    return allEntries

-@app.get("/synonym/{synonym}")
-async def findOneSynonym(response: Response, synonym: str):
+@app.get("/{taxonomy_name}/{branch}/synonym/{synonym}")
+async def findOneSynonym(response: Response, branch: str, taxonomy_name: str, synonym: str):
    """
    Get synonym corresponding to id within taxonomy
    """
-    result = get_nodes("SYNONYMS", synonym)
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
+    result = taxonomy.get_nodes("SYNONYMS", synonym)
    oneSynonym = list(result)
    check_single(oneSynonym)
    return oneSynonym[0]

-@app.get("/synonym")
-async def findAllSynonyms(response: Response):
+@app.get("/{taxonomy_name}/{branch}/synonym")
+async def findAllSynonyms(response: Response, branch: str, taxonomy_name: str):
    """
    Get all synonyms within taxonomy
    """
-    result = get_all_nodes("SYNONYMS")
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
+    result = taxonomy.get_all_nodes("SYNONYMS")
    allSynonyms = list(result)
    return allSynonyms

-@app.get("/stopword/{stopword}")
-async def findOneStopword(response: Response, stopword: str):
+@app.get("/{taxonomy_name}/{branch}/stopword/{stopword}")
+async def findOneStopword(response: Response, branch: str, taxonomy_name: str, stopword: str):
    """
    Get stopword corresponding to id within taxonomy
    """
-    result = get_nodes("STOPWORDS", stopword)
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
+    result = taxonomy.get_nodes("STOPWORDS", stopword)
    oneStopword = list(result)
    check_single(oneStopword)
    return oneStopword[0]

-@app.get("/stopword")
-async def findAllStopwords(response: Response):
+@app.get("/{taxonomy_name}/{branch}/stopword")
+async def findAllStopwords(response: Response, branch: str, taxonomy_name: str):
    """
    Get all stopwords within taxonomy
    """
-    result = get_all_nodes("STOPWORDS")
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
+    result = taxonomy.get_all_nodes("STOPWORDS")
    allStopwords = list(result)
    return allStopwords

-@app.get("/header")
-async def findHeader(response: Response):
+@app.get("/{taxonomy_name}/{branch}/header")
+async def findHeader(response: Response, branch: str, taxonomy_name: str):
    """
    Get __header__ within taxonomy
    """
-    result = get_nodes("TEXT", "__header__")
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
+    result = taxonomy.get_nodes("TEXT", "__header__")
    header = list(result)
    return header[0]

-@app.get("/footer")
-async def findFooter(response: Response):
+@app.get("/{taxonomy_name}/{branch}/footer")
+async def findFooter(response: Response, branch: str, taxonomy_name: str):
    """
    Get __footer__ within taxonomy
    """
-    result = get_nodes("TEXT", "__footer__")
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
+    result = taxonomy.get_nodes("TEXT", "__footer__")
    footer = list(result)
    return footer[0]

-@app.get("/search")
-async def searchNode(response: Response, query: str):
-    result = full_text_search(query)
+@app.get("/{taxonomy_name}/{branch}/search")
+async def searchNode(response: Response, branch: str, taxonomy_name: str, query: str):
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
+    result = taxonomy.full_text_search(query)
    return result
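
Every read endpoint above is now scoped by a taxonomy name and a branch, which together identify one editing project. A minimal client-side sketch of the new URL scheme (hypothetical host, taxonomy and branch names; assumes the `requests` package):

    import requests

    BASE = "http://localhost:8080"             # hypothetical API host
    taxonomy, branch = "test", "mybranch"      # one project = taxonomy + branch

    # All node routes are prefixed with /{taxonomy_name}/{branch}/
    roots = requests.get(f"{BASE}/{taxonomy}/{branch}/rootnodes").json()
    hits = requests.get(f"{BASE}/{taxonomy}/{branch}/search",
                        params={"query": "milk"}).json()
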
+@app.get("/{taxonomy_name}/{branch}/downloadexport") +async def exportToTextFile(response: Response, branch: str, taxonomy_name: str, background_tasks: BackgroundTasks): + taxonomy = TaxonomyGraph(branch, taxonomy_name) + file = taxonomy.file_export() + + # Add a background task for removing exported taxonomy file + background_tasks.add_task(file_cleanup, file) + return FileResponse(file) + +@app.get("/{taxonomy_name}/{branch}/githubexport") +async def exportToGithub(response: Response, branch: str, taxonomy_name: str, background_tasks: BackgroundTasks): + taxonomy = TaxonomyGraph(branch, taxonomy_name) + try: + url, file = taxonomy.github_export() + # Add a background task for removing exported taxonomy file + background_tasks.add_task(file_cleanup, file) + return url + + except GithubBranchExistsError: + raise HTTPException(status_code=500, detail="The Github branch already exists!") + + except GithubUploadError: + raise HTTPException(status_code=500, detail="Github upload error!") + # Post methods -@app.post("/nodes") -async def createNode(request: Request): +@app.post("/{taxonomy_name}/{branch}/import") +async def importFromGithub(request: Request, branch: str, taxonomy_name: str): + """ + Get taxonomy from Product Opener GitHub repository + """ + incomingData = await request.json() + description = incomingData["description"] + + taxonomy = TaxonomyGraph(branch, taxonomy_name) + if (not taxonomy.is_valid_branch_name()): + raise HTTPException(status_code=500, detail="Enter a valid branch name!") + if (taxonomy.does_project_exist()): + raise HTTPException(status_code=500, detail="Project already exists!") + if (not taxonomy.is_branch_unique()): + raise HTTPException(status_code=500, detail="Branch name should be unique!") + + result = taxonomy.import_from_github(description) + return result + +@app.post("/{taxonomy_name}/{branch}/nodes") +async def createNode(request: Request, branch: str, taxonomy_name: str): """ Creating a new node in a taxonomy """ + taxonomy = TaxonomyGraph(branch, taxonomy_name) incomingData = await request.json() id = incomingData["id"] main_language = incomingData["main_language"] @@ -213,87 +301,94 @@ async def createNode(request: Request): if (main_language == None): raise HTTPException(status_code=400, detail="Invalid main language code") - create_node(get_label(id), id, main_language) - if (get_label(id) == "ENTRY"): - add_node_to_end(get_label(id), id) + taxonomy.create_node(taxonomy.get_label(id), id, main_language) + if (taxonomy.get_label(id) == "ENTRY"): + taxonomy.add_node_to_end(taxonomy.get_label(id), id) else: - add_node_to_beginning(get_label(id), id) + taxonomy.add_node_to_beginning(taxonomy.get_label(id), id) -@app.post("/entry/{entry}") -async def editEntry(request: Request, entry: str): +@app.post("/{taxonomy_name}/{branch}/entry/{entry}") +async def editEntry(request: Request, branch: str, taxonomy_name: str, entry: str): """ Editing an entry in a taxonomy. New key-value pairs can be added, old key-value pairs can be updated. 
URL will be of format '/entry/{entry}'
    """
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
    incomingData = await request.json()
-    result = update_nodes("ENTRY", entry, incomingData)
+    result = taxonomy.update_nodes("ENTRY", entry, incomingData)
    updatedEntry = list(result)
    return updatedEntry

-@app.post("/entry/{entry}/children")
-async def editEntryChildren(request: Request, entry: str):
+@app.post("/{taxonomy_name}/{branch}/entry/{entry}/children")
+async def editEntryChildren(request: Request, branch: str, taxonomy_name: str, entry: str):
    """
    Editing an entry's children in a taxonomy.
    New children can be added, old children can be removed.
    URL will be of format '/entry/{entry}/children'
    """
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
    incomingData = await request.json()
-    result = update_node_children(entry, incomingData)
+    result = taxonomy.update_node_children(entry, incomingData)
    updatedChildren = list(result)
    return updatedChildren

-@app.post("/synonym/{synonym}")
-async def editSynonyms(request: Request, synonym: str):
+@app.post("/{taxonomy_name}/{branch}/synonym/{synonym}")
+async def editSynonyms(request: Request, branch: str, taxonomy_name: str, synonym: str):
    """
    Editing a synonym in a taxonomy.
    New key-value pairs can be added, old key-value pairs can be updated.
    URL will be of format '/synonym/{synonym}'
    """
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
    incomingData = await request.json()
-    result = update_nodes("SYNONYMS", synonym, incomingData)
+    result = taxonomy.update_nodes("SYNONYMS", synonym, incomingData)
    updatedSynonym = list(result)
    return updatedSynonym

-@app.post("/stopword/{stopword}")
-async def editStopwords(request: Request, stopword: str):
+@app.post("/{taxonomy_name}/{branch}/stopword/{stopword}")
+async def editStopwords(request: Request, branch: str, taxonomy_name: str, stopword: str):
    """
    Editing a stopword in a taxonomy.
    New key-value pairs can be added, old key-value pairs can be updated.
    URL will be of format '/stopword/{stopword}'
    """
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
    incomingData = await request.json()
-    result = update_nodes("STOPWORDS", stopword, incomingData)
+    result = taxonomy.update_nodes("STOPWORDS", stopword, incomingData)
    updatedStopword = list(result)
    return updatedStopword

-@app.post("/header")
-async def editHeader(incomingData: Header):
+@app.post("/{taxonomy_name}/{branch}/header")
+async def editHeader(incomingData: Header, branch: str, taxonomy_name: str):
    """
    Editing the __header__ in a taxonomy.
    """
+    taxonomy = TaxonomyGraph(branch, taxonomy_name)
    convertedData = incomingData.dict()
-    result = update_nodes("TEXT", "__header__", convertedData)
+    result = taxonomy.update_nodes("TEXT", "__header__", convertedData)
    updatedHeader = list(result)
    return updatedHeader

-@app.post("/footer")
-async def editFooter(incomingData: Footer):
+@app.post("/{taxonomy_name}/{branch}/footer")
+async def editFooter(incomingData: Footer, branch: str, taxonomy_name: str):
    """
    Editing the __footer__ in a taxonomy.
""" + taxonomy = TaxonomyGraph(branch, taxonomy_name) convertedData = incomingData.dict() - result = update_nodes("TEXT", "__footer__", convertedData) + result = taxonomy.update_nodes("TEXT", "__footer__", convertedData) updatedFooter = list(result) return updatedFooter # Delete methods -@app.delete("/nodes") -async def deleteNode(request: Request): +@app.delete("/{taxonomy_name}/{branch}/nodes") +async def deleteNode(request: Request, branch: str, taxonomy_name: str): """ Deleting given node from a taxonomy """ + taxonomy = TaxonomyGraph(branch, taxonomy_name) incomingData = await request.json() id = incomingData["id"] - delete_node(get_label(id), id) \ No newline at end of file + taxonomy.delete_node(taxonomy.get_label(id), id) \ No newline at end of file diff --git a/backend/editor/entries.py b/backend/editor/entries.py index 4e22f5a5..e8db5279 100644 --- a/backend/editor/entries.py +++ b/backend/editor/entries.py @@ -2,301 +2,518 @@ Database helper functions for API """ import re -from .graph_db import get_current_transaction # Neo4J transactions helper -from .normalizer import normalizing # Normalizing tags - -def get_label(id): - """ - Helper function for getting the label for a given id - """ - if (id.startswith('stopword')): return 'STOPWORDS' - elif (id.startswith('synonym')): return 'SYNONYMS' - elif (id.startswith('__header__') or id.startswith('__footer__')): return 'TEXT' - else: return 'ENTRY' - -def create_node(label, entry, main_language_code): - """ - Helper function used for creating a node with given id and label - """ - #Normalising new Node ID - - normalised_entry = normalizing(entry, main_language_code) - - query = [f"""CREATE (n:{label})\n"""] - params = {"id": normalised_entry} - - # Build all basic keys of a node - if (label == "ENTRY"): - canonical_tag = normalised_entry.split(":", 1)[1] - query.append(f""" SET n.main_language = $main_language_code """) # Required for only an entry - params["main_language_code"] = main_language_code - else: - canonical_tag = "" - - query.append(f""" SET n.id = $id """) - query.append(f""" SET n.tags_{main_language_code} = [$canonical_tag] """) - query.append(f""" SET n.preceding_lines = [] """) - - params["canonical_tag"] = canonical_tag - result = get_current_transaction().run(" ".join(query), params) - return result - -def add_node_to_end(label, entry): - """ - Helper function which adds an existing node to end of taxonomy - """ - # Delete relationship between current last node and __footer__ - query = f""" - MATCH (last_node)-[r:is_before]->(footer:TEXT) WHERE footer.id = "__footer__" DELETE r - RETURN last_node - """ - result = get_current_transaction().run(query) - end_node = result.data()[0]['last_node'] - end_node_label = get_label(end_node['id']) # Get current last node ID - - # Rebuild relationships by inserting incoming node at the end - query = [] - query = f""" - MATCH (new_node:{label}) WHERE new_node.id = $id - MATCH (last_node:{end_node_label}) WHERE last_node.id = $endnodeid - MATCH (footer:TEXT) WHERE footer.id = "__footer__" - CREATE (last_node)-[:is_before]->(new_node) - CREATE (new_node)-[:is_before]->(footer) - """ - result = get_current_transaction().run(query, {"id": entry, "endnodeid": end_node['id']}) - -def add_node_to_beginning(label, entry): - """ - Helper function which adds an existing node to beginning of taxonomy - """ - # Delete relationship between current first node and __header__ - query = f""" - MATCH (header:TEXT)-[r:is_before]->(first_node) WHERE header.id = "__header__" DELETE r - RETURN 
first_node
-    """
-    result = get_current_transaction().run(query)
-    start_node = result.data()[0]['first_node']
-    start_node_label = get_label(start_node['id']) # Get current first node ID
-
-    # Rebuild relationships by inserting incoming node at the beginning
-    query= f"""
-        MATCH (new_node:{label}) WHERE new_node.id = $id
-        MATCH (first_node:{start_node_label}) WHERE first_node.id = $startnodeid
-        MATCH (header:TEXT) WHERE header.id = "__header__"
-        CREATE (new_node)-[:is_before]->(first_node)
-        CREATE (header)-[:is_before]->(new_node)
-    """
-    result = get_current_transaction().run(query, {"id": entry, "startnodeid": start_node['id']})
-
-def delete_node(label, entry):
-    """
-    Helper function used for deleting a node with given id and label
-    """
-    # Finding node to be deleted using node ID
-    query = f"""
-        // Find node to be deleted using node ID
-        MATCH (deleted_node:{label})-[:is_before]->(next_node) WHERE deleted_node.id = $id
-        MATCH (previous_node)-[:is_before]->(deleted_node)
-        // Remove node
-        DETACH DELETE (deleted_node)
-        // Rebuild relationships after deletion
-        CREATE (previous_node)-[:is_before]->(next_node)
-    """
-    result = get_current_transaction().run(query, {"id": entry})
-    return result
-
-def get_all_nodes(label):
-    """
-    Helper function used for getting all nodes with/without given label
-    """
-    qualifier = f":{label}" if label else ""
-    query = f"""
-        MATCH (n{qualifier}) RETURN n
-    """
-    result = get_current_transaction().run(query)
-    return result
-
-def get_nodes(label, entry):
-    """
-    Helper function used for getting the node with given id and label
-    """
-    query = f"""
-        MATCH (n:{label}) WHERE n.id = $id
-        RETURN n
-    """
-    result = get_current_transaction().run(query, {"id": entry})
-    return result
-
-def get_parents(entry):
-    """
-    Helper function used for getting node parents with given id
-    """
-    query = f"""
-        MATCH (child_node:ENTRY)-[r:is_child_of]->(parent) WHERE child_node.id = $id
-        RETURN parent.id
-    """
-    result = get_current_transaction().run(query, {"id": entry})
-    return result
-
-def get_children(entry):
-    """
-    Helper function used for getting node children with given id
-    """
-    query = f"""
-        MATCH (child)-[r:is_child_of]->(parent_node:ENTRY) WHERE parent_node.id = $id
-        RETURN child.id
-    """
-    result = get_current_transaction().run(query, {"id": entry})
-    return result
-
-def update_nodes(label, entry, new_node_keys):
-    """
-    Helper function used for updating a node with given id and label
-    """
-    # Sanity check keys
-    for key in new_node_keys.keys():
-        if not re.match(r"^\w+$", key) or key == "id":
-            raise ValueError("Invalid key: %s", key)
+import tempfile
+
+import urllib.request  # Sending requests
+from .github_functions import GithubOperations  # Github functions
+
+from .exceptions import TaxonomyImportError
+from .exceptions import TaxonomyParsingError, TaxonomyUnparsingError
+from .exceptions import GithubUploadError, GithubBranchExistsError  # Custom exceptions
+
+from .graph_db import get_current_transaction, get_current_session  # Neo4J transactions helper
+from .graph_db import TransactionCtx  # Neo4J transactions context manager
+
+from openfoodfacts_taxonomy_parser import parser  # Parser for taxonomies
+from openfoodfacts_taxonomy_parser import unparser  # Unparser for taxonomies
+from openfoodfacts_taxonomy_parser import normalizer  # Normalizing tags
+
+class TaxonomyGraph:
+
+    """Class for database operations"""

-    # Get current node information and deleted keys
-    curr_node = get_nodes(label, entry).data()[0]['n']
-    curr_node_keys = 
list(curr_node.keys()) - deleted_keys = (set(curr_node_keys) ^ set(new_node_keys)) - - # Check for keys having null/empty values - for key in curr_node_keys: - if (curr_node[key] == []) or (curr_node[key] == None): - deleted_keys.add(key) - - # Build query - query = [f"""MATCH (n:{label}) WHERE n.id = $id """] - - # Delete keys removed by user - for key in deleted_keys: - if key == "id": # Doesn't require to be deleted - continue - query.append(f"""\nREMOVE n.{key}\n""") - - # Adding normalized tags ids corresponding to entry tags - normalised_new_node_key = {} - for keys in new_node_keys.keys(): - if keys.startswith("tags_") and not keys.endswith("_str"): - if "_ids_" not in keys: - keys_language_code = keys.split('_', 1)[1] - normalised_value = [] - for values in new_node_keys[keys]: - normalised_value.append(normalizing(values, keys_language_code)) - normalised_new_node_key[keys] = normalised_value - normalised_new_node_key["tags_ids_"+keys_language_code] = normalised_value - else: - pass # we generate tags_ids, and ignore the one sent + def __init__(self, branch_name, taxonomy_name): + self.taxonomy_name = taxonomy_name + self.branch_name = branch_name + self.project_name = 'p_' + taxonomy_name + '_' + branch_name + + def get_label(self, id): + """ + Helper function for getting the label for a given id + """ + if (id.startswith('stopword')): return 'STOPWORDS' + elif (id.startswith('synonym')): return 'SYNONYMS' + elif (id.startswith('__header__') or id.startswith('__footer__')): return 'TEXT' + else: return 'ENTRY' + + def create_node(self, label, entry, main_language_code): + """ + Helper function used for creating a node with given id and label + """ + # Normalizing new Node ID + normalised_entry = normalizer.normalizing(entry, main_language_code) + + query = [f"""CREATE (n:{self.project_name}:{label})\n"""] + params = {"id": normalised_entry} + + # Build all basic keys of a node + if (label == "ENTRY"): + canonical_tag = normalised_entry.split(":", 1)[1] + query.append(f""" SET n.main_language = $main_language_code """) # Required for only an entry + params["main_language_code"] = main_language_code else: - # No need to normalise - normalised_new_node_key[keys] = new_node_keys[keys] - - # Update keys - for key in normalised_new_node_key.keys(): - query.append(f"""\nSET n.{key} = ${key}\n""") - - query.append(f"""RETURN n""") - - params = dict(normalised_new_node_key, id=entry) - result = get_current_transaction().run(" ".join(query), params) - return result - -def update_node_children(entry, new_children_ids): - """ - Helper function used for updation of node children with given id - """ - # Parse node ids from Neo4j Record object - current_children = [record["child.id"] for record in list(get_children(entry))] - deleted_children = set(current_children) - set(new_children_ids) - added_children = set(new_children_ids) - set(current_children) - - # Delete relationships - for child in deleted_children: - query = f""" - MATCH (deleted_child:ENTRY)-[rel:is_child_of]->(parent:ENTRY) - WHERE parent.id = $id AND deleted_child.id = $child - DELETE rel - """ - get_current_transaction().run(query, {"id": entry, "child": child}) - - # Create non-existing nodes - query = """MATCH (child:ENTRY) WHERE child.id in $ids RETURN child.id""" - existing_ids = [record['child.id'] for record in get_current_transaction().run(query, ids=list(added_children))] - to_create = added_children - set(existing_ids) - - # Normalising new children node ID - created_child_ids = [] - for child in to_create: - 
main_language_code = child.split(":", 1)[0]
-        created_node = create_node("ENTRY", child, main_language_code)
-        created_child_ids.append(created_node.id)
+            canonical_tag = ""
+
+        query.append(f""" SET n.id = $id """)
+        query.append(f""" SET n.tags_{main_language_code} = [$canonical_tag] """)
+        query.append(f""" SET n.preceding_lines = [] """)
+        query.append(f""" RETURN n """)
+
+        params["canonical_tag"] = canonical_tag
+        result = get_current_transaction().run(" ".join(query), params)
+        return result
+
+    def parse_taxonomy(self, filename):
+        """
+        Helper function to call the Open Food Facts Python Taxonomy Parser
+        """
+        # Close current transaction to use the session variable in parser
+        get_current_transaction().commit()
+
+        # Create parser object and pass current session to it
+        parser_object = parser.Parser(get_current_session())
+        try:
+            # Parse taxonomy with given file name and branch name
+            parser_object(filename, self.branch_name, self.taxonomy_name)
+            return True
+        except:
+            raise TaxonomyParsingError()
+
+    def import_from_github(self, description):
+        """
+        Helper function to import a taxonomy from GitHub
+        """
+        base_url = "https://raw.githubusercontent.com/openfoodfacts/openfoodfacts-server/main/taxonomies/"
+        filename = self.taxonomy_name + '.txt'
+        base_url += filename
+        try:
+            with tempfile.TemporaryDirectory(prefix="taxonomy-") as tmpdir:
+
+                # File to save the downloaded taxonomy
+                filepath = f"{tmpdir}/{filename}"
+
+                # Download and create the taxonomy file in the temporary directory
+                urllib.request.urlretrieve(base_url, filepath)
+
+                status = self.parse_taxonomy(filepath)  # Parse the taxonomy
+
+                with TransactionCtx():
+                    self.create_project(description)  # Creates a "project node" in neo4j
+
+            return status
+        except:
+            raise TaxonomyImportError()
+
+    def dump_taxonomy(self):
+        """
+        Helper function to create the txt file of a taxonomy
+        """
+        # Create unparser object and pass current session to it
+        unparser_object = unparser.WriteTaxonomy(get_current_session())
+        # Creates a unique file for dumping the taxonomy
+        filename = self.project_name + '.txt'
+        try:
+            # Unparse the taxonomy into the file with given file name and branch name
+            unparser_object(filename, self.branch_name, self.taxonomy_name)
+            return filename
+        except Exception:
+            raise TaxonomyUnparsingError()

-    # TODO: We would prefer to add the node just after its parent entry
-    add_node_to_end("ENTRY", child)
+    def file_export(self):
+        """Export a taxonomy for download"""
+        # Close current transaction to use the session variable in unparser
+        get_current_transaction().commit()

-    # Stores result of last query executed
-    result = []
-    for child in created_child_ids:
-        # Create new relationships if it doesn't exist
-        query = f"""
-            MATCH (parent:ENTRY), (new_child:ENTRY) WHERE parent.id = $id AND new_child.id = $child
-            MERGE (new_child)-[r:is_child_of]->(parent)
+        filepath = self.dump_taxonomy()
+        return filepath
+
+    def github_export(self):
+        """Export a taxonomy to Github"""
+        # Close current transaction to use the session variable in unparser
+        get_current_transaction().commit()
+
+        filepath = self.dump_taxonomy()
+        # Create a new transaction context
+        with TransactionCtx():
+            result = self.export_to_github(filepath)
+            self.close_project()
+        return result
+
+    def export_to_github(self, filename):
+        """
+        Helper function to export a taxonomy to GitHub
+        """
+        query = """MATCH (n:PROJECT) WHERE n.id = $project_name RETURN n.description"""
+        result = get_current_transaction().run(query, {"project_name": self.project_name})
+        description = 
result.data()[0]['n.description'] + + github_object = GithubOperations(self.taxonomy_name, self.branch_name) + try: + github_object.checkout_branch() + except: + raise GithubBranchExistsError() + try: + github_object.update_file(filename) + pr_object = github_object.create_pr(description) + return (pr_object.html_url, filename) + except: + raise GithubUploadError() + + def does_project_exist(self): + """ + Helper function to check the existence of a project + """ + query = """MATCH (n:PROJECT) WHERE n.id = $project_name RETURN n""" + result = get_current_transaction().run(query, {"project_name" : self.project_name}) + if (result.data() == []): + return False + else: + return True + + def is_branch_unique(self): + """ + Helper function to check uniqueness of GitHub branch + """ + query = """MATCH (n:PROJECT) WHERE n.branch_name = $branch_name RETURN n""" + result = get_current_transaction().run(query, {"branch_name" : self.branch_name}) + + github_object = GithubOperations(self.taxonomy_name, self.branch_name) + current_branches = github_object.list_all_branches() + + if ((result.data() == []) and (self.branch_name not in current_branches)): + return True + else: + return False + + def is_valid_branch_name(self): + """ + Helper function to check if a branch name is valid + """ + return normalizer.normalizing(self.branch_name) == self.branch_name + + def create_project(self, description): + """ + Helper function to create a node with label "PROJECT" + """ + query = """ + CREATE (n:PROJECT) + SET n.id = $project_name + SET n.taxonomy_name = $taxonomy_name + SET n.branch_name = $branch_name + SET n.description = $description + SET n.status = $status + SET n.created_at = datetime() """ - result = get_current_transaction().run(query, {"id": entry, "child": child}) + params = { + 'project_name' : self.project_name, + 'taxonomy_name' : self.taxonomy_name, + 'branch_name' : self.branch_name, + 'description' : description, + 'status' : "OPEN" + } + get_current_transaction().run(query, params) - return result - -def full_text_search(text): - """ - Helper function used for searching a taxonomy - """ - # Escape special characters - normalized_text = re.sub(r"[^A-Za-z0-9_]", r" ", text) - normalized_id_text = normalizing(text) - - text_query_exact = "*" + normalized_text + '*' - text_query_fuzzy = normalized_text + "~" - text_id_query_fuzzy = normalized_id_text + "~" - text_id_query_exact = "*" + normalized_id_text + "*" - params = { - "text_query_fuzzy" : text_query_fuzzy, - "text_query_exact" : text_query_exact, - "text_id_query_fuzzy" : text_id_query_fuzzy, - "text_id_query_exact" : text_id_query_exact - } - - # Fuzzy search and wildcard (*) search on two indexes - # Fuzzy search has more priority, since it matches more close strings - # IDs are given slightly lower priority than tags in fuzzy search - query = """ - CALL { - CALL db.index.fulltext.queryNodes("nodeSearchIds", $text_id_query_fuzzy) - yield node, score as score_ - where score_ > 0 - return node, score_ * 3 as score - UNION - CALL db.index.fulltext.queryNodes("nodeSearchTags", $text_query_fuzzy) - yield node, score as score_ - where score_ > 0 - return node, score_ * 5 as score - UNION - CALL db.index.fulltext.queryNodes("nodeSearchIds", $text_id_query_exact) - yield node, score as score_ - where score_ > 0 - return node, score_ as score - UNION - CALL db.index.fulltext.queryNodes("nodeSearchTags", $text_query_exact) - yield node, score as score_ - where score_ > 0 - return node, score_ as score + def close_project(self): + """ + Helper 
function to close a Taxonomy Editor project and update its status to "CLOSED"
+        """
+        query = """
+            MATCH (n:PROJECT)
+            WHERE n.id = $project_name
+            SET n.status = $status
+        """
+        params = {
+            'project_name' : self.project_name,
+            'status' : "CLOSED"
+        }
+        get_current_transaction().run(query, params)
+
+    def list_existing_projects(self):
+        """
+        Helper function for listing all existing projects created in Taxonomy Editor
+        """
+        query = """
+            MATCH (n:PROJECT)
+            WHERE n.status = "OPEN" RETURN n
+            ORDER BY n.created_at
+        """
+        result = get_current_transaction().run(query)
+        return result
+
+    def add_node_to_end(self, label, entry):
+        """
+        Helper function which adds an existing node to end of taxonomy
+        """
+        # Delete relationship between current last node and __footer__
+        query = f"""
+            MATCH (last_node)-[r:is_before]->(footer:{self.project_name}:TEXT) WHERE footer.id = "__footer__" DELETE r
+            RETURN last_node
+        """
+        result = get_current_transaction().run(query)
+        end_node = result.data()[0]['last_node']
+        end_node_label = self.get_label(end_node['id'])  # Get label of current last node
+
+        # Rebuild relationships by inserting incoming node at the end
+        query = f"""
+            MATCH (new_node:{self.project_name}:{label}) WHERE new_node.id = $id
+            MATCH (last_node:{self.project_name}:{end_node_label}) WHERE last_node.id = $endnodeid
+            MATCH (footer:{self.project_name}:TEXT) WHERE footer.id = "__footer__"
+            CREATE (last_node)-[:is_before]->(new_node)
+            CREATE (new_node)-[:is_before]->(footer)
+        """
+        result = get_current_transaction().run(query, {"id": entry, "endnodeid": end_node['id']})
+
+    def add_node_to_beginning(self, label, entry):
+        """
+        Helper function which adds an existing node to beginning of taxonomy
+        """
+        # Delete relationship between current first node and __header__
+        query = f"""
+            MATCH (header:{self.project_name}:TEXT)-[r:is_before]->(first_node) WHERE header.id = "__header__" DELETE r
+            RETURN first_node
+        """
+        result = get_current_transaction().run(query)
+        start_node = result.data()[0]['first_node']
+        start_node_label = self.get_label(start_node['id'])  # Get label of current first node
+
+        # Rebuild relationships by inserting incoming node at the beginning
+        query = f"""
+            MATCH (new_node:{self.project_name}:{label}) WHERE new_node.id = $id
+            MATCH (first_node:{self.project_name}:{start_node_label}) WHERE first_node.id = $startnodeid
+            MATCH (header:{self.project_name}:TEXT) WHERE header.id = "__header__"
+            CREATE (new_node)-[:is_before]->(first_node)
+            CREATE (header)-[:is_before]->(new_node)
+        """
+        result = get_current_transaction().run(query, {"id": entry, "startnodeid": start_node['id']})
+
+    def delete_node(self, label, entry):
+        """
+        Helper function used for deleting a node with given id and label
+        """
+        # Finding node to be deleted using node ID
+        query = f"""
+            // Find node to be deleted using node ID
+            MATCH (deleted_node:{self.project_name}:{label})-[:is_before]->(next_node) WHERE deleted_node.id = $id
+            MATCH (previous_node)-[:is_before]->(deleted_node)
+            // Remove node
+            DETACH DELETE (deleted_node)
+            // Rebuild relationships after deletion
+            CREATE (previous_node)-[:is_before]->(next_node)
+        """
+        result = get_current_transaction().run(query, {"id": entry})
+        return result
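
The three helpers above maintain each project's nodes as a single chain of is_before relationships running from __header__ to __footer__, rewiring that chain whenever a node is added or deleted. An illustrative Cypher check of the invariant, written as a query string in the style of this module (the project label p_test_mybranch is hypothetical; this snippet is not part of the patch):

    # Illustrative only: every project forms one is_before chain
    # from its header node to its footer node.
    chain_check = """
        MATCH path = (h:p_test_mybranch:TEXT {id: '__header__'})
                     -[:is_before*]->(f:p_test_mybranch:TEXT {id: '__footer__'})
        RETURN length(path)
    """
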
+    def get_all_nodes(self, label):
+        """
+        Helper function used for getting all nodes with/without given label
+        """
+        qualifier = f":{label}" if label else ""
+        query = f"""
+            MATCH (n:{self.project_name}{qualifier}) RETURN n
+        """
+        result = get_current_transaction().run(query)
+        return result
+
+    def get_all_root_nodes(self):
+        """
+        Helper function used for getting all root nodes in a taxonomy
+        """
+        query = f"""
+            MATCH (n:{self.project_name}) WHERE NOT (n)-[:is_child_of]->() RETURN n
+        """
+        result = get_current_transaction().run(query)
+        return result
+
+    def get_nodes(self, label, entry):
+        """
+        Helper function used for getting the node with given id and label
+        """
+        query = f"""
+            MATCH (n:{self.project_name}:{label}) WHERE n.id = $id
+            RETURN n
+        """
+        result = get_current_transaction().run(query, {"id": entry})
+        return result
+
+    def get_parents(self, entry):
+        """
+        Helper function used for getting node parents with given id
+        """
+        query = f"""
+            MATCH (child_node:{self.project_name}:ENTRY)-[r:is_child_of]->(parent) WHERE child_node.id = $id
+            RETURN parent.id
+        """
+        result = get_current_transaction().run(query, {"id": entry})
+        return result
+
+    def get_children(self, entry):
+        """
+        Helper function used for getting node children with given id
+        """
+        query = f"""
+            MATCH (child)-[r:is_child_of]->(parent_node:{self.project_name}:ENTRY) WHERE parent_node.id = $id
+            RETURN child.id
+        """
+        result = get_current_transaction().run(query, {"id": entry})
+        return result
+
+    def update_nodes(self, label, entry, new_node_keys):
+        """
+        Helper function used for updating a node with given id and label
+        """
+        # Sanity check keys
+        for key in new_node_keys.keys():
+            if not re.match(r"^\w+$", key) or key == "id":
+                raise ValueError("Invalid key: %s", key)

+        # Get current node information and deleted keys
+        curr_node = self.get_nodes(label, entry).data()[0]['n']
+        curr_node_keys = list(curr_node.keys())
+        deleted_keys = (set(curr_node_keys) ^ set(new_node_keys))
+
+        # Check for keys having null/empty values
+        for key in curr_node_keys:
+            if (curr_node[key] == []) or (curr_node[key] == None):
+                deleted_keys.add(key)
+
+        # Build query
+        query = [f"""MATCH (n:{self.project_name}:{label}) WHERE n.id = $id """]
+
+        # Delete keys removed by user
+        for key in deleted_keys:
+            if key == "id":  # Doesn't require to be deleted
+                continue
+            query.append(f"""\nREMOVE n.{key}\n""")
+
+        # Adding normalized tags ids corresponding to entry tags
+        normalised_new_node_keys = {}
+        for keys in new_node_keys.keys():
+            if keys.startswith("tags_") and not keys.endswith("_str"):
+                if "_ids_" not in keys:
+                    keys_language_code = keys.split('_', 1)[1]
+                    normalised_value = []
+                    for values in new_node_keys[keys]:
+                        normalised_value.append(normalizer.normalizing(values, keys_language_code))
+                    normalised_new_node_keys[keys] = normalised_value
+                    normalised_new_node_keys["tags_ids_"+keys_language_code] = normalised_value
+                else:
+                    pass  # We generate tags_ids, and ignore the one sent
+            else:
+                # No need to normalise
+                normalised_new_node_keys[keys] = new_node_keys[keys]
+
+        # Update keys
+        for key in normalised_new_node_keys.keys():
+            query.append(f"""\nSET n.{key} = ${key}\n""")
+
+        query.append(f"""RETURN n""")
+
+        params = dict(normalised_new_node_keys, id=entry)
+        result = get_current_transaction().run(" ".join(query), params)
+        return result
+
+    def update_node_children(self, entry, new_children_ids):
+        """
+        Helper function used for updating node children with given id
+        """
+        # Parse node ids from Neo4j Record object
+        current_children = [record["child.id"] for 
record in list(self.get_children(entry))]
+        deleted_children = set(current_children) - set(new_children_ids)
+        added_children = set(new_children_ids) - set(current_children)
+
+        # Delete relationships
+        for child in deleted_children:
+            query = f"""
+                MATCH (deleted_child:{self.project_name}:ENTRY)-[rel:is_child_of]->(parent:{self.project_name}:ENTRY)
+                WHERE parent.id = $id AND deleted_child.id = $child
+                DELETE rel
+            """
+            get_current_transaction().run(query, {"id": entry, "child": child})
+
+        # Create non-existing nodes
+        query = f"""MATCH (child:{self.project_name}:ENTRY) WHERE child.id in $ids RETURN child.id"""
+        existing_ids = [record['child.id'] for record in get_current_transaction().run(query, ids=list(added_children))]
+        to_create = added_children - set(existing_ids)
+
+        # Normalising new children node ID
+        created_child_ids = []
+
+        for child in to_create:
+            main_language_code = child.split(":", 1)[0]
+            created_node = self.create_node("ENTRY", child, main_language_code)
+            created_node_id = created_node.data()[0]['n']['id']
+            created_child_ids.append(created_node_id)
+
+            # TODO: We would prefer to add the node just after its parent entry
+            self.add_node_to_end("ENTRY", child)
+
+        # Stores result of last query executed
+        result = []
+        for child in created_child_ids:
+            # Create new relationships if they don't exist
+            query = f"""
+                MATCH (parent:{self.project_name}:ENTRY), (new_child:{self.project_name}:ENTRY)
+                WHERE parent.id = $id AND new_child.id = $child
+                MERGE (new_child)-[r:is_child_of]->(parent)
+            """
+            result = get_current_transaction().run(query, {"id": entry, "child": child})
+
+        return result
+
+    def full_text_search(self, text):
+        """
+        Helper function used for searching a taxonomy
+        """
+        # Escape special characters
+        normalized_text = re.sub(r"[^A-Za-z0-9_]", r" ", text)
+        normalized_id_text = normalizer.normalizing(text)
+
+        id_index = self.project_name+'_SearchIds'
+        tags_index = self.project_name+'_SearchTags'
+
+        text_query_exact = "*" + normalized_text + '*'
+        text_query_fuzzy = normalized_text + "~"
+        text_id_query_fuzzy = normalized_id_text + "~"
+        text_id_query_exact = "*" + normalized_id_text + "*"
+        params = {
+            "id_index" : id_index,
+            "tags_index" : tags_index,
+            "text_query_fuzzy" : text_query_fuzzy,
+            "text_query_exact" : text_query_exact,
+            "text_id_query_fuzzy" : text_id_query_fuzzy,
+            "text_id_query_exact" : text_id_query_exact
+        }
+
+        # Fuzzy search and wildcard (*) search on two indexes
+        # Fuzzy search has higher priority, since it matches closer strings
+        # IDs are given slightly lower priority than tags in fuzzy search
+        query = """
+            CALL {
+                CALL db.index.fulltext.queryNodes($id_index, $text_id_query_fuzzy)
+                yield node, score as score_
+                where score_ > 0
+                return node, score_ * 3 as score
+                UNION
+                CALL db.index.fulltext.queryNodes($tags_index, $text_query_fuzzy)
+                yield node, score as score_
+                where score_ > 0
+                return node, score_ * 5 as score
+                UNION
+                CALL db.index.fulltext.queryNodes($id_index, $text_id_query_exact)
+                yield node, score as score_
+                where score_ > 0
+                return node, score_ as score
+                UNION
+                CALL db.index.fulltext.queryNodes($tags_index, $text_query_exact)
+                yield node, score as score_
+                where score_ > 0
+                return node, score_ as score
+            }
+            with node.id as node, score
+            RETURN node, sum(score) as score
+
+            ORDER BY score DESC
+        """
+        result = [record["node"] for record in get_current_transaction().run(query, params)]
+        return result
\ No newline at end of file
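
For reference, a small runnable sketch of the Lucene query strings that full_text_search derives from a sample input; the values shown follow from the code above, while the project name p_test_mybranch is hypothetical:

    import re

    text = "soy milk"
    normalized_text = re.sub(r"[^A-Za-z0-9_]", r" ", text)   # "soy milk"
    text_query_exact = "*" + normalized_text + "*"           # "*soy milk*" (wildcard)
    text_query_fuzzy = normalized_text + "~"                 # "soy milk~"  (Lucene fuzzy)
    # Ids are normalized first (e.g. "*soy-milk*" and "soy-milk~") and queried
    # against the per-project fulltext indexes:
    id_index = "p_test_mybranch" + "_SearchIds"
    tags_index = "p_test_mybranch" + "_SearchTags"
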
diff --git a/backend/editor/exceptions.py b/backend/editor/exceptions.py
index ff7a1860..d221f3c0 100644
--- a/backend/editor/exceptions.py
+++ b/backend/editor/exceptions.py
@@ -1,8 +1,60 @@
+"""
+Custom exceptions for Taxonomy Editor API
+"""
 class TransactionMissingError(RuntimeError):
     """
     Raised when attempting to run a query using null transaction context variable
     """
     def __init__(self):
-        exception_message = f"Transaction context variable is null/missing"
+        exception_message = "Transaction context variable is null/missing"
+        return super().__init__(exception_message)
+
+class SessionMissingError(RuntimeError):
+    """
+    Raised when attempting to run a query using null session context variable
+    """
+
+    def __init__(self):
+        exception_message = "Session context variable is null/missing"
+        return super().__init__(exception_message)
+
+class TaxonomyImportError(RuntimeError):
+    """
+    Raised when fetching a taxonomy from GitHub fails
+    """
+    def __init__(self):
+        exception_message = "Unable to fetch the given taxonomy from GitHub"
+        return super().__init__(exception_message)
+
+class TaxonomyParsingError(RuntimeError):
+    """
+    Raised when parsing a taxonomy imported from GitHub fails
+    """
+    def __init__(self):
+        exception_message = "Unable to parse the requested taxonomy file imported from GitHub"
+        return super().__init__(exception_message)
+
+class TaxonomyUnparsingError(RuntimeError):
+    """
+    Raised when unparsing a taxonomy exported from Neo4j fails
+    """
+    def __init__(self):
+        exception_message = "Unable to unparse the requested taxonomy exported from Neo4j"
+        return super().__init__(exception_message)
+
+class GithubUploadError(RuntimeError):
+    """
+    Raised when uploading a taxonomy to Github fails
+    """
+    def __init__(self):
+        exception_message = "Unable to upload the given taxonomy file to Github"
+        return super().__init__(exception_message)
+
+class GithubBranchExistsError(RuntimeError):
+    """
+    Raised when the branch to be created already exists in Github
+    """
+    def __init__(self):
+        exception_message = "The new branch to be created already exists"
         return super().__init__(exception_message)
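
These exceptions are surfaced to clients as HTTP errors at the API layer; a condensed sketch of that pattern, mirroring the githubexport endpoint earlier in this patch:

    from fastapi import HTTPException

    try:
        url, file = taxonomy.github_export()    # taxonomy: a TaxonomyGraph
    except GithubBranchExistsError:
        raise HTTPException(status_code=500, detail="The Github branch already exists!")
    except GithubUploadError:
        raise HTTPException(status_code=500, detail="Github upload error!")
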
diff --git a/backend/editor/github_functions.py b/backend/editor/github_functions.py
new file mode 100644
index 00000000..5b9e8f86
--- /dev/null
+++ b/backend/editor/github_functions.py
@@ -0,0 +1,67 @@
+"""
+Github helper functions for the Taxonomy Editor API
+"""
+from github import Github
+from .settings import access_token, repo_owner  # Github settings
+from textwrap import dedent
+
+class GithubOperations:
+
+    """Class for Github operations"""
+
+    def __init__(self, taxonomy_name, branch_name):
+        self.taxonomy_name = taxonomy_name
+        self.branch_name = branch_name
+        self.repo = self.init_driver_and_repo()
+
+    def init_driver_and_repo(self):
+        """
+        Initialize connection to Github with an access token
+        """
+        github_driver = Github(access_token)
+        repo = github_driver.get_repo(f"{repo_owner}/openfoodfacts-server")
+        return repo
+
+    def list_all_branches(self):
+        """
+        List of all current branches in the "openfoodfacts-server" repo
+        """
+        result = list(self.repo.get_branches())
+        all_branches = [branch.name for branch in result]
+        return all_branches
+
+    def checkout_branch(self):
+        """
+        Create a new branch in the "openfoodfacts-server" repo
+        """
+        source_branch = self.repo.get_branch("main")
+        self.repo.create_git_ref(ref='refs/heads/'+self.branch_name, sha=source_branch.commit.sha)
+
+    def update_file(self, filename):
+        """
+        Update the taxonomy txt file with the version edited using the Taxonomy Editor
+        """
+        # Find taxonomy text file to be updated
+        github_filepath = f"taxonomies/{self.taxonomy_name}.txt"
+        commit_message = f"Update {self.taxonomy_name}.txt"
+
+        current_file = self.repo.get_contents(github_filepath)
+        with open(filename, 'r') as f:
+            new_file_contents = f.read()
+
+        # Update the file
+        self.repo.update_file(github_filepath, commit_message, new_file_contents, current_file.sha, branch=self.branch_name)
+
+    def create_pr(self, description):
+        """
+        Create a pull request to "openfoodfacts-server" repo
+        """
+        title = f"taxonomy: Update {self.taxonomy_name} taxonomy"
+        body = dedent(f"""
+        ### What
+        This is a pull request automatically created using the Taxonomy Editor.
+
+        ### Description
+        {description}
+        """)
+        return self.repo.create_pull(title=title, body=body, head=self.branch_name, base="main")
\ No newline at end of file
diff --git a/backend/editor/graph_db.py b/backend/editor/graph_db.py
index 3b65518b..a0811939 100644
--- a/backend/editor/graph_db.py
+++ b/backend/editor/graph_db.py
@@ -1,24 +1,34 @@
+"""
+Neo4J Transactions manager for DB operations
+"""
 import contextlib
-import contextvars
-from neo4j import GraphDatabase # Interface with Neo4J
-from . import settings # Neo4J settings
-from .exceptions import TransactionMissingError # Custom exceptions
+import contextvars  # Used for creation of context vars
+
+from neo4j import GraphDatabase  # Interface with Neo4J
+from . import settings  # Neo4J settings
+
+from .exceptions import SessionMissingError, TransactionMissingError  # Custom exceptions

 txn = contextvars.ContextVar('txn')
 txn.set(None)

+session = contextvars.ContextVar('session')
+session.set(None)
+
 @contextlib.contextmanager
 def TransactionCtx():
     """
-    Transaction context will set global transaction "txn" for the code in context
-    Transactions are automatically rollbacked if an exception occurs within the context
+    Transaction context will set global transaction "txn" for the code in context.
+    Transactions are automatically rolled back if an exception occurs within the context.
""" - global txn - with driver.session() as session: - with session.begin_transaction() as _txn: + global txn, session + with driver.session() as _session: + with _session.begin_transaction() as _txn: txn.set(_txn) - yield txn + session.set(_session) + yield txn, session txn.set(None) + session.set(None) def initialize_db(): """ @@ -35,7 +45,20 @@ def shutdown_db(): driver.close() def get_current_transaction(): + """ + Fetches transaction variable in current context to perform DB operations + """ curr_txn = txn.get() if (curr_txn == None): raise TransactionMissingError() - return curr_txn \ No newline at end of file + return curr_txn + + +def get_current_session(): + """ + Fetches session variable in current context to perform DB operations + """ + curr_session = session.get() + if (curr_session == None): + raise SessionMissingError() + return curr_session \ No newline at end of file diff --git a/backend/editor/settings.py b/backend/editor/settings.py index 6ece6931..ba1efce4 100644 --- a/backend/editor/settings.py +++ b/backend/editor/settings.py @@ -1,4 +1,8 @@ -# Settings for Neo4J - +""" +Settings for Neo4J +""" import os + uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687") +access_token = os.environ.get("GITHUB_PAT", "") +repo_owner = os.environ.get("REPO_OWNER", "openfoodfacts") \ No newline at end of file diff --git a/backend/requirements.txt b/backend/requirements.txt index 3bd24f41..cd27575e 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -16,4 +16,5 @@ uvicorn==0.18.2 uvloop==0.16.0 watchfiles==0.16.0 websockets==10.3 -Unidecode==1.3.4 +PyGithub==1.56 +-e ../parser/ \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 12d587d8..4ba913ae 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -23,12 +23,9 @@ services: command: neo4j taxonomy_api: restart: ${RESTART_POLICY:-no} - build: backend + image: off-taxonomy-editor-api:${DOCKER_TAG} environment: NEO4J_URI: "bolt://neo4j:7687" - volumes: - # for now we are in development mode, mount backend directory as code/ - - ./backend/editor:/code/editor taxonomy_frontend: # this is the nginx frontend serving react static version or redirecting queries image: ghcr.io/openfoodfacts/taxonomy-editor/frontend:${TAG} diff --git a/docker/dev.yml b/docker/dev.yml index 04e47c48..3abcd1bd 100644 --- a/docker/dev.yml +++ b/docker/dev.yml @@ -3,6 +3,14 @@ version: "3.7" services: # in dev we want to use watch assets and recompile on the fly # also we want to build at start time in case some files changed, as we want to avoid recreating volumes + taxonomy_api: + build: + context: . + dockerfile: backend/Dockerfile + volumes: + # in development mode, mount code directory dynamically + - ./backend/editor:/code/editor + - ./parser:/parser taxonomy_node: image: taxonomy-editor/taxonomy_node:dev build: @@ -43,7 +51,8 @@ services: taxonomy-backend: image: taxonomy-editor/taxonomy_backend:dev build: - context: backend + context: . 
+ dockerfile: backend/Dockerfile # align user id args: USER_UID: ${USER_UID:-1000} diff --git a/backend/editor/normalizer.py b/parser/openfoodfacts_taxonomy_parser/normalizer.py similarity index 54% rename from backend/editor/normalizer.py rename to parser/openfoodfacts_taxonomy_parser/normalizer.py index 97f482af..68e1e635 100644 --- a/backend/editor/normalizer.py +++ b/parser/openfoodfacts_taxonomy_parser/normalizer.py @@ -1,30 +1,35 @@ +""" +String normalizer +""" import re import unicodedata + import unidecode -def normalizing(line, lang="default"): - """normalize a string depending of the language code lang""" + +def normalizing(line, lang="default", char="-"): + """Normalize a string depending on the language code""" line = unicodedata.normalize("NFC", line) - # removing accent + # Removing accent if lang in ["fr", "ca", "es", "it", "nl", "pt", "sk", "en"]: - line = re.sub(r"[¢£¤¥§©ª®°²³µ¶¹º¼½¾×‰€™]", "-", line) + line = re.sub(r"[¢£¤¥§©ª®°²³µ¶¹º¼½¾×‰€™]", char, line) line = unidecode.unidecode(line) - # lower case except if language in list + # Lower case except if language in list if lang not in []: line = line.lower() - # changing unwanted character to "-" - line = re.sub(r"[\u0000-\u0027\u200b]", "-", line) - line = re.sub(r"&\w+;", "-", line) + # Changing unwanted character to "-" + line = re.sub(r"[\u0000-\u0027\u200b]", char, line) + line = re.sub(r"&\w+;", char, line) line = re.sub( r"[\s!\"#\$%&'()*+,\/:;<=>?@\[\\\]^_`{\|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿×ˆ˜–—‘’‚“”„†‡•…‰‹›€™\t]", # noqa: E501 - "-", + char, line, ) - # removing excess "-" - line = re.sub(r"-+", "-", line) - line = line.strip("-") + # Removing excess "-" + line = re.sub(r"-+", char, line) + line = line.strip(char) return line diff --git a/parser/openfoodfacts_taxonomy_parser/parser.py b/parser/openfoodfacts_taxonomy_parser/parser.py index 48ede832..82937d92 100644 --- a/parser/openfoodfacts_taxonomy_parser/parser.py +++ b/parser/openfoodfacts_taxonomy_parser/parser.py @@ -1,13 +1,13 @@ import logging +import os import re import sys -import unicodedata import iso639 -import unidecode from neo4j import GraphDatabase from .exception import DuplicateIDError +from .normalizer import normalizing def ellipsis(text, max=20): @@ -18,37 +18,37 @@ def ellipsis(text, max=20): class Parser: """Parse a taxonomy file and build a neo4j graph""" - def __init__(self, uri="bolt://localhost:7687"): - self.driver = GraphDatabase.driver(uri) - self.session = ( - self.driver.session() - ) # Doesn't create error even if there is no active database + def __init__(self, session): + self.session = session - def create_headernode(self, header): - """create the node for the header""" - query = """ - CREATE (n:TEXT {id: '__header__' }) + def create_headernode(self, header, multi_label): + """Create the node for the header""" + query = f""" + CREATE (n:{multi_label}:TEXT) + SET n.id = '__header__' SET n.preceding_lines= $header SET n.src_position= 1 """ self.session.run(query, header=header) - def create_node(self, data): - """run the query to create the node with data dictionary""" + def create_node(self, data, multi_label): + """Run the query to create the node with data dictionary""" position_query = """ + SET n.id = $id SET n.is_before = $is_before - SET n.preceding_lines= $preceding_lines - SET n.src_position= $src_position + SET n.preceding_lines = $preceding_lines + SET n.src_position = $src_position """ entry_query = "" if data["id"] == "__footer__": - id_query = " CREATE (n:TEXT {id: $id }) \n " + id_query = f" CREATE 
(n:{multi_label}:TEXT) \n " elif data["id"].startswith("synonyms"): - id_query = " CREATE (n:SYNONYMS {id: $id }) \n " + id_query = f" CREATE (n:{multi_label}:SYNONYMS) \n " elif data["id"].startswith("stopwords"): - id_query = " CREATE (n:STOPWORDS {id: $id }) \n " + id_query = f" CREATE (n:{multi_label}:STOPWORDS) \n " else: - id_query = " CREATE (n:ENTRY {id: $id , main_language : $main_language}) \n " + id_query = f" CREATE (n:{multi_label}:ENTRY) \n " + position_query += " SET n.main_language = $main_language " if data["parent_tag"]: entry_query += " SET n.parents = $parent_tag \n" for key in data: @@ -63,11 +63,20 @@ def create_node(self, data): self.session.run(query, data, is_before=self.is_before) def normalized_filename(self, filename): - """add the .txt extension if it is missing in the filename""" + """Add the .txt extension if it is missing in the filename""" return filename + (".txt" if (len(filename) < 4 or filename[-4:] != ".txt") else "") + def get_project_name(self, taxonomy_name, branch_name): + """Create a project name for given branch and taxonomy""" + return "p_" + taxonomy_name + "_" + branch_name + + def create_multi_label(self, taxonomy_name, branch_name): + """Create a combined label with taxonomy name and branch name""" + project_name = self.get_project_name(taxonomy_name, branch_name) + return project_name + ":" + ("t_" + taxonomy_name) + ":" + ("b_" + branch_name) + def file_iter(self, filename, start=0): - """generator to get the file line by line""" + """Generator to get the file line by line""" with open(filename, "r", encoding="utf8") as file: for line_number, line in enumerate(file): if line_number < start: @@ -87,35 +96,8 @@ def file_iter(self, filename, start=0): yield line_number, line yield line_number, "" # to end the last entry if not ended - def normalizing(self, line, lang="default"): - """normalize a string depending of the language code lang""" - line = unicodedata.normalize("NFC", line) - - # removing accent - if lang in ["fr", "ca", "es", "it", "nl", "pt", "sk", "en"]: - line = re.sub(r"[¢£¤¥§©ª®°²³µ¶¹º¼½¾×‰€™]", "-", line) - line = unidecode.unidecode(line) - - # lower case except if language in list - if lang not in []: - line = line.lower() - - # changing unwanted character to "-" - line = re.sub(r"[\u0000-\u0027\u200b]", "-", line) - line = re.sub(r"&\w+;", "-", line) - line = re.sub( - r"[\s!\"#\$%&'()*+,\/:;<=>?@\[\\\]^_`{\|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿×ˆ˜–—‘’‚“”„†‡•…‰‹›€™\t]", # noqa: E501 - "-", - line, - ) - - # removing excess "-" - line = re.sub(r"-+", "-", line) - line = line.strip("-") - return line - def remove_stopwords(self, lc, words): - """to remove the stopwords that were read at the beginning of the file""" + """Remove the stopwords that were read at the beginning of the file""" # First check if this language has stopwords if lc in self.stopwords: words_to_remove = self.stopwords[lc] @@ -128,20 +110,21 @@ def remove_stopwords(self, lc, words): return words def add_line(self, line): - """to get a normalized string but keeping the language code "lc:" , + """ + Get a normalized string but keeping the language code "lc:", used for id and parent tag """ lc, line = line.split(":", 1) new_line = lc + ":" - new_line += self.remove_stopwords(lc, self.normalizing(line, lc)) + new_line += self.remove_stopwords(lc, normalizing(line, lc)) return new_line def get_lc_value(self, line): - """to get the language code "lc" and a list of normalized values""" + """Get the language code "lc" and a list of normalized values""" lc, line = 
line.split(":", 1) new_line = [] for word in line.split(","): - new_line.append(self.remove_stopwords(lc, self.normalizing(word, lc))) + new_line.append(self.remove_stopwords(lc, normalizing(word, lc))) return lc, new_line def new_node_data(self): @@ -163,7 +146,8 @@ def set_data_id(self, data, id, line_number): return data def header_harvest(self, filename): - """to harvest the header (comment with #), + """ + Harvest the header (comment with #), it has its own function because some header has multiple blocks """ h = 0 @@ -316,7 +300,7 @@ def harvest(self, filename): tagsids_list = [] for word in line.split(","): tags_list.append(word.strip()) - word_normalized = self.remove_stopwords(lang, self.normalizing(word, lang)) + word_normalized = self.remove_stopwords(lang, normalizing(word, lang)) if word_normalized not in tagsids_list: # in case 2 normalized synonyms are the same tagsids_list.append(word_normalized) @@ -354,26 +338,25 @@ def harvest(self, filename): data["src_position"] = line_number + 1 - len(data["preceding_lines"]) yield data - def create_nodes(self, filename): + def create_nodes(self, filename, multi_label): """Adding nodes to database""" logging.info("Creating nodes") - filename = self.normalized_filename(filename) harvested_data = self.harvest(filename) - self.create_headernode(next(harvested_data)) + self.create_headernode(next(harvested_data), multi_label) for entry in harvested_data: - self.create_node(entry) + self.create_node(entry, multi_label) - def create_previous_link(self): + def create_previous_link(self, multi_label): logging.info("Creating 'is_before' links") - query = "MATCH(n) WHERE exists(n.is_before) return n.id,n.is_before" + query = f"MATCH(n:{multi_label}) WHERE exists(n.is_before) return n.id, n.is_before" results = self.session.run(query) for result in results: id = result["n.id"] id_previous = result["n.is_before"] - query = """ - MATCH(n) WHERE n.id = $id - MATCH(p) WHERE p.id= $id_previous + query = f""" + MATCH(n:{multi_label}) WHERE n.id = $id + MATCH(p:{multi_label}) WHERE p.id= $id_previous CREATE (p)-[r:is_before]->(n) RETURN r """ @@ -389,9 +372,9 @@ def create_previous_link(self): elif not relation[0]: logging.error("link not created between %s and %s", id, id_previous) - def parent_search(self): + def parent_search(self, multi_label): """Get the parent and the child to link""" - query = "match(n) WHERE size(n.parents)>0 return n.id, n.parents" + query = f"MATCH (n:{multi_label}:ENTRY) WHERE SIZE(n.parents)>0 RETURN n.id, n.parents" results = self.session.run(query) for result in results: id = result["n.id"] @@ -399,14 +382,14 @@ def parent_search(self): for parent in parent_list: yield parent, id - def create_child_link(self): + def create_child_link(self, multi_label): """Create the relations between nodes""" logging.info("Creating 'is_child_of' links") - for parent, child_id in self.parent_search(): + for parent, child_id in self.parent_search(multi_label): lc, parent_id = parent.split(":") - query = """ MATCH(p) WHERE $parent_id IN p.tags_ids_""" + lc - query += """ - MATCH(c) WHERE c.id= $child_id + query = f""" MATCH (p:{multi_label}:ENTRY) WHERE $parent_id IN p.tags_ids_""" + lc + query += f""" + MATCH (c:{multi_label}) WHERE c.id= $child_id CREATE (c)-[r:is_child_of]->(p) RETURN r """ @@ -418,27 +401,32 @@ def delete_used_properties(self): query = "MATCH (n) SET n.is_before = null, n.parents = null" self.session.run(query) - def create_fulltext_index(self): - query = """ - CREATE FULLTEXT INDEX nodeSearchIds FOR (n:ENTRY) ON EACH 
[n.id] - OPTIONS {indexConfig: {`fulltext.analyzer`: 'keyword'}} - """ - self.session.run(query) + def create_fulltext_index(self, taxonomy_name, branch_name): + """Create indexes for search""" + project_name = self.get_project_name(taxonomy_name, branch_name) + query = [ + f"""CREATE FULLTEXT INDEX {project_name+'_SearchIds'} IF NOT EXISTS + FOR (n:{project_name}) ON EACH [n.id]\n""" + ] + query.append("""OPTIONS {indexConfig: {`fulltext.analyzer`: 'keyword'}}""") + self.session.run("".join(query)) language_codes = [lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""] tags_prefixed_lc = ["n.tags_" + lc + "_str" for lc in language_codes] tags_prefixed_lc = ", ".join(tags_prefixed_lc) - query = ( - f"""CREATE FULLTEXT INDEX nodeSearchTags FOR (n:ENTRY) ON EACH [{tags_prefixed_lc}]""" - ) + query = f"""CREATE FULLTEXT INDEX {project_name+'_SearchTags'} IF NOT EXISTS + FOR (n:{project_name}) ON EACH [{tags_prefixed_lc}]""" self.session.run(query) - def __call__(self, filename): - """process the file""" - self.create_nodes(filename) - self.create_child_link() - self.create_previous_link() - self.create_fulltext_index() + def __call__(self, filename, branch_name, taxonomy_name): + """Process the file""" + filename = self.normalized_filename(filename) + branch_name = normalizing(branch_name, char="_") + multi_label = self.create_multi_label(taxonomy_name, branch_name) + self.create_nodes(filename, multi_label) + self.create_child_link(multi_label) + self.create_previous_link(multi_label) + self.create_fulltext_index(taxonomy_name, branch_name) # self.delete_used_properties() @@ -447,5 +435,14 @@ def __call__(self, filename): handlers=[logging.FileHandler(filename="parser.log", encoding="utf-8")], level=logging.INFO ) filename = sys.argv[1] if len(sys.argv) > 1 else "test" - parse = Parser() - parse(filename) + branch_name = sys.argv[2] if len(sys.argv) > 1 else "branch" + taxonomy_name = sys.argv[3] if len(sys.argv) > 1 else filename.rsplit(".", 1)[0] + + # Initialize neo4j + uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687") + driver = GraphDatabase.driver(uri) + session = driver.session() + + # Pass session variable to parser object + parse = Parser(session) + parse(filename, branch_name, taxonomy_name) diff --git a/parser/openfoodfacts_taxonomy_parser/unparser.py b/parser/openfoodfacts_taxonomy_parser/unparser.py index 3a319b9c..315fb77c 100644 --- a/parser/openfoodfacts_taxonomy_parser/unparser.py +++ b/parser/openfoodfacts_taxonomy_parser/unparser.py @@ -1,25 +1,38 @@ +import os +import sys + from neo4j import GraphDatabase +from .normalizer import normalizing + class WriteTaxonomy: """Write the taxonomy of the neo4j database into a file""" - def __init__(self, uri="bolt://localhost:7687"): - self.driver = GraphDatabase.driver(uri) - # Doesn't create error even if there is no active database - self.session = self.driver.session() + def __init__(self, session): + self.session = session def normalized_filename(self, filename): """add the .txt extension if it is missing in the filename""" return filename + (".txt" if (len(filename) < 4 or filename[-4:] != ".txt") else "") - def get_all_nodes(self): + def get_project_name(self, taxonomy_name, branch_name): + """Create a project name for given branch and taxonomy""" + return "p_" + taxonomy_name + "_" + branch_name + + def create_multi_label(self, taxonomy_name, branch_name): + """Create a combined label with taxonomy name and branch name""" + project_name = self.get_project_name(taxonomy_name, branch_name) + return 
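A driving sketch for the parser's new three-argument entry point, equivalent to the __main__ block just above but with stricter argv guards: as written, the block tests len(sys.argv) > 1 before indexing sys.argv[2] and sys.argv[3], so running it with only a filename would raise IndexError. The NEO4J_URI default mirrors settings.py.

import os
import sys

from neo4j import GraphDatabase
from openfoodfacts_taxonomy_parser.parser import Parser

filename = sys.argv[1] if len(sys.argv) > 1 else "test"
branch_name = sys.argv[2] if len(sys.argv) > 2 else "branch"
taxonomy_name = sys.argv[3] if len(sys.argv) > 3 else filename.rsplit(".", 1)[0]

driver = GraphDatabase.driver(os.environ.get("NEO4J_URI", "bolt://localhost:7687"))
session = driver.session()

parse = Parser(session)
# Creates nodes labelled p_<taxonomy>_<branch>:t_<taxonomy>:b_<branch> plus
# the p_<taxonomy>_<branch>_SearchIds / _SearchTags fulltext indexes
parse(filename, branch_name, taxonomy_name)
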
project_name + ":" + ("t_" + taxonomy_name) + ":" + ("b_" + branch_name) + + def get_all_nodes(self, multi_label): """query the database and yield each node with its parents, this function use the relationships between nodes""" - query = """ + query = f""" MATCH path = ShortestPath( - (h:TEXT{id:"__header__"})-[:is_before*]->(f:TEXT{id:"__footer__"}) + (h:{multi_label}:TEXT)-[:is_before*]->(f:{multi_label}:TEXT) ) + WHERE h.id="__header__" AND f.id="__footer__" UNWIND nodes(path) AS n RETURN n , [(n)-[:is_child_of]->(m) | m ] """ @@ -71,9 +84,9 @@ def get_parents_lines(self, parents): parent_id = parent["tags_" + lc][0] yield "<" + lc + ":" + parent_id - def iter_lines(self): + def iter_lines(self, multi_label): previous_block_id = "" - for node, parents in self.get_all_nodes(): + for node, parents in self.get_all_nodes(multi_label): node = dict(node) has_content = node["id"] not in ["__header__", "__footer__"] # eventually add a blank line but in specific case @@ -118,14 +131,24 @@ def rewrite_file(self, filename, lines): for line in lines: file.write(line + "\n") - def __call__(self, filename): - lines = self.iter_lines() + def __call__(self, filename, branch_name, taxonomy_name): + filename = self.normalized_filename(filename) + branch_name = normalizing(branch_name, char="_") + multi_label = self.create_multi_label(taxonomy_name, branch_name) + lines = self.iter_lines(multi_label) self.rewrite_file(filename, lines) if __name__ == "__main__": - import sys - filename = sys.argv[1] if len(sys.argv) > 1 else "test" - write = WriteTaxonomy() - write(filename) + branch_name = sys.argv[2] if len(sys.argv) > 1 else "branch" + taxonomy_name = sys.argv[3] if len(sys.argv) > 1 else filename.rsplit(".", 1)[0] + + # Initialize neo4j + uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687") + driver = GraphDatabase.driver(uri) + session = driver.session() + + # Pass session variable to unparser object + write = WriteTaxonomy(session) + write(filename, branch_name, taxonomy_name) diff --git a/parser/setup.py b/parser/setup.py new file mode 100644 index 00000000..33f58552 --- /dev/null +++ b/parser/setup.py @@ -0,0 +1,25 @@ +from setuptools import setup + +requires = [ + req.replace("==", ">=") for req in open("requirements.txt").read().split("\n") if req.strip() +] +test_requirements = [ + req.replace("==", ">=") + for req in open("requirements-test.txt").read().split("\n") + if req.strip() +] + +setup( + name="openfoodfacts_taxonomy_parser", + version="0.1.0", + description="Taxonomy Parser written in Python for Open Food Facts", + author="Pierre Slamich", + author_email="pierre@openfoodfacts.org", + url="https://world.openfoodfacts.org", + packages=["openfoodfacts_taxonomy_parser"], + package_dir={"openfoodfacts_taxonomy_parser": "openfoodfacts_taxonomy_parser"}, + install_requires=requires, + extras_require={}, + test_suite="tests", + tests_require=test_requirements, +) diff --git a/parser/tests/conftest.py b/parser/tests/conftest.py index 1e956906..cf53d30b 100644 --- a/parser/tests/conftest.py +++ b/parser/tests/conftest.py @@ -1,3 +1,4 @@ +import os import time import pytest @@ -8,7 +9,7 @@ @pytest.fixture def neo4j(): """waiting for neo4j to be ready""" - uri = "bolt://localhost:7687" + uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687") driver = GraphDatabase.driver(uri) session = driver.session() connected = False diff --git a/parser/tests/integration/test_parse_unparse_integration.py b/parser/tests/integration/test_parse_unparse_integration.py index 25840dd2..3ababa3a 100644 --- 
a/parser/tests/integration/test_parse_unparse_integration.py +++ b/parser/tests/integration/test_parse_unparse_integration.py @@ -10,28 +10,47 @@ @pytest.fixture(autouse=True) def test_setup(neo4j): - # delete all the nodes and relations in the database - query = "MATCH (n) DETACH DELETE n" + # delete all the nodes, relations and search indexes in the database + query = "MATCH (n:p_test_branch:t_test:b_branch) DETACH DELETE n" neo4j.session().run(query) + query = "DROP INDEX p_test_branch_SearchIds IF EXISTS" + neo4j.session().run(query) + query = "DROP INDEX p_test_branch_SearchTags IF EXISTS" + neo4j.session().run(query) + + query1 = "MATCH (n:p_test_branch1:t_test:b_branch1) DETACH DELETE n" + neo4j.session().run(query1) + query1 = "DROP INDEX p_test_branch1_SearchIds IF EXISTS" + neo4j.session().run(query1) + query1 = "DROP INDEX p_test_branch1_SearchTags IF EXISTS" + neo4j.session().run(query1) + query2 = "MATCH (n:p_test_branch2:t_test:b_branch2) DETACH DELETE n" + neo4j.session().run(query2) + query2 = "DROP INDEX p_test_branch2_SearchIds IF EXISTS" + neo4j.session().run(query2) + query2 = "DROP INDEX p_test_branch2_SearchTags IF EXISTS" + neo4j.session().run(query2) -def test_round_trip(): + +def test_round_trip(neo4j): """test parsing and dumping back a taxonomy""" - test_parser = parser.Parser() - session = test_parser.session + session = neo4j.session() + test_parser = parser.Parser(session) # parse taxonomy - test_parser(TEST_TAXONOMY_TXT) + test_parser(TEST_TAXONOMY_TXT, "branch", "test") # just quick check it runs ok with total number of nodes - query = "MATCH (n) RETURN COUNT(*)" + query = "MATCH (n:p_test_branch:t_test:b_branch) RETURN COUNT(*)" result = session.run(query) number_of_nodes = result.value()[0] assert number_of_nodes == 13 - session.close() # dump taxonomy back - test_dumper = unparser.WriteTaxonomy() - lines = list(test_dumper.iter_lines()) + test_dumper = unparser.WriteTaxonomy(session) + lines = list(test_dumper.iter_lines("p_test_branch:t_test:b_branch")) + + session.close() original_lines = [line.rstrip("\n") for line in open(TEST_TAXONOMY_TXT)] # expected result is close to original file with a few tweaks @@ -51,3 +70,54 @@ def test_round_trip(): expected_lines.append(line) assert expected_lines == lines + + +def test_two_branch_round_trip(neo4j): + """test parsing and dumping the same taxonomy with two different branches""" + + session = neo4j.session() + + test_parser = parser.Parser(session) + + # parse taxonomy with branch1 + test_parser(TEST_TAXONOMY_TXT, "branch1", "test") + # parse taxonomy with branch2 + test_parser(TEST_TAXONOMY_TXT, "branch2", "test") + + # just quick check it runs ok with total number of nodes + query = "MATCH (n:p_test_branch1:t_test:b_branch1) RETURN COUNT(*)" + result = session.run(query) + number_of_nodes = result.value()[0] + assert number_of_nodes == 13 + + query = "MATCH (n:p_test_branch2:t_test:b_branch2) RETURN COUNT(*)" + result = session.run(query) + number_of_nodes = result.value()[0] + assert number_of_nodes == 13 + + # dump taxonomy back + test_dumper = unparser.WriteTaxonomy(session) + lines_branch1 = list(test_dumper.iter_lines("p_test_branch1:t_test:b_branch1")) + lines_branch2 = list(test_dumper.iter_lines("p_test_branch2:t_test:b_branch2")) + + session.close() + + original_lines = [line.rstrip("\n") for line in open(TEST_TAXONOMY_TXT)] + # expected result is close to original file with a few tweaks + expected_lines = [] + for line in original_lines: + # first tweak: spaces between stopwords + if 
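The fixture below repeats the same delete-nodes/drop-indexes triple once per project. A hypothetical helper (not part of this patch) makes the underlying pattern explicit, with index names matching what create_fulltext_index generates:

def delete_project(session, taxonomy_name, branch_name):
    """Remove one project's nodes and its two fulltext indexes (sketch)."""
    project = f"p_{taxonomy_name}_{branch_name}"
    session.run(f"MATCH (n:{project}:t_{taxonomy_name}:b_{branch_name}) DETACH DELETE n")
    session.run(f"DROP INDEX {project}_SearchIds IF EXISTS")
    session.run(f"DROP INDEX {project}_SearchTags IF EXISTS")

# e.g. delete_project(neo4j.session(), "test", "branch1")
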
line.startswith("stopwords:fr: aux"): + line = "stopwords:fr:aux, au, de, le, du, la, a, et" + # second tweak: renaming parent + elif line.startswith("(p:p_test_branch:t_test:b_branch) + RETURN c.id, p.id + """ results = session.run(query) created_pairs = results.values() @@ -125,8 +134,11 @@ def test_calling(): assert pair in expected_pairs # Order link test - test_parser.create_previous_link() - query = "MATCH (n)-[:is_before]->(p) RETURN n.id, p.id " + test_parser.create_previous_link("p_test_branch:t_test:b_branch") + query = """ + MATCH (n:p_test_branch:t_test:b_branch)-[:is_before]->(p:p_test_branch:t_test:b_branch) + RETURN n.id, p.id + """ results = session.run(query) created_pairs = results.values() @@ -151,3 +163,4 @@ def test_calling(): ] for pair in created_pairs: assert pair in expected_pairs + session.close() diff --git a/parser/tests/unit/test_parser_unit.py b/parser/tests/unit/test_parser_unit.py index a8f01356..25757b8d 100644 --- a/parser/tests/unit/test_parser_unit.py +++ b/parser/tests/unit/test_parser_unit.py @@ -1,13 +1,15 @@ import pathlib -from openfoodfacts_taxonomy_parser import parser +from openfoodfacts_taxonomy_parser import normalizer, parser # taxonomy in text format : test.txt TEST_TAXONOMY_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test.txt") -def test_normalized_filename(): - x = parser.Parser() +def test_normalized_filename(neo4j): + session = neo4j.session() + + x = parser.Parser(session) normalizer = x.normalized_filename name = normalizer("test") assert name == "test.txt" @@ -15,25 +17,28 @@ def test_normalized_filename(): assert name == "test.txt" name = normalizer("t") assert name == "t.txt" + session.close() -def test_fileiter(): - x = parser.Parser() +def test_fileiter(neo4j): + session = neo4j.session() + x = parser.Parser(session) file = x.file_iter(TEST_TAXONOMY_TXT) + for counter, (_, line) in enumerate(file): assert line == "" or line[0] == "#" or ":" in line if counter == 26: assert line == "carbon_footprint_fr_foodges_value:fr:10" assert counter == 37 + session.close() def test_normalizing(): - x = parser.Parser() text = "Numéro #1, n°1 des ¾ des Français*" - text = x.normalizing(text, "fr") + text = normalizer.normalizing(text, "fr") assert text == "numero-1-n-1-des-des-francais" text = "Randôm Languäge wìth àccénts" - normal_text = x.normalizing(text, "fr") + normal_text = normalizer.normalizing(text, "fr") assert normal_text == "random-language-with-accents" - normal_text = x.normalizing(text, "de") + normal_text = normalizer.normalizing(text, "de") assert normal_text == "randôm-languäge-wìth-àccénts"