Skip to content

Commit

Permalink
Fix using id_hash_keys as pipeline params (#2717)
Browse files Browse the repository at this point in the history
* Fix using id_hash_keys as pipeline params

* Update Documentation & Code Style

* add tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
tstadel and github-actions[bot] authored Jun 24, 2022
1 parent a084a98 commit 1168f63
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 6 deletions.
6 changes: 5 additions & 1 deletion docs/_src/api/api/file_converter.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ Validate if the language of the text is one of valid languages.
#### BaseConverter.run

```python
def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None)
```

Extract text from a file.
Expand Down Expand Up @@ -114,6 +114,10 @@ This option can be used to add test for encoding errors. If the extracted text i
not one of the valid languages, then it is likely an encoding error resulting
in garbled text.
- `encoding`: Select the file encoding (default is `UTF-8`)
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.

<a id="docx"></a>

Expand Down
2 changes: 1 addition & 1 deletion docs/_src/api/api/preprocessor.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class BasePreProcessor(BaseComponent)

```python
@abstractmethod
def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[Document]
def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True, id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Perform document cleaning and splitting. Takes a single Document or a List of Documents as input and returns a
Expand Down
20 changes: 16 additions & 4 deletions haystack/nodes/file_converter/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def run( # type: ignore
known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
id_hash_keys: Optional[List[str]] = None,
):
"""
Extract text from a file.
Expand All @@ -162,6 +163,10 @@ def run( # type: ignore
not one of the valid languages, then it might likely be encoding error resulting
in garbled text.
:param encoding: Select the file encoding (default is `UTF-8`)
:param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
"""

if isinstance(file_paths, Path):
Expand All @@ -178,6 +183,7 @@ def run( # type: ignore
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
encoding=encoding,
id_hash_keys=id_hash_keys,
):
documents.append(doc)

Expand All @@ -192,14 +198,20 @@ def run( # type: ignore

def run_batch(  # type: ignore
    self,
    file_paths: Union[Path, List[Path]],
    meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
    remove_numeric_tables: Optional[bool] = None,
    known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
    valid_languages: Optional[List[str]] = None,
    encoding: Optional[str] = "UTF-8",
    id_hash_keys: Optional[List[str]] = None,
):
    """
    Batch entry point for the converter.

    Conversion of multiple files is already handled by `run`, so this method
    simply forwards every parameter — including `id_hash_keys`, which controls
    how document ids are derived — to `run` unchanged.
    """
    # Collect the arguments once and forward them all; keeps the delegation
    # in lockstep with the signature.
    forwarded = dict(
        file_paths=file_paths,
        meta=meta,
        remove_numeric_tables=remove_numeric_tables,
        known_ligatures=known_ligatures,
        valid_languages=valid_languages,
        encoding=encoding,
        id_hash_keys=id_hash_keys,
    )
    return self.run(**forwarded)
5 changes: 5 additions & 0 deletions haystack/nodes/preprocessor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def process(
split_length: Optional[int] = 1000,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = True,
id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
"""
Perform document cleaning and splitting. Takes a single Document or a List of Documents as input and returns a
Expand Down Expand Up @@ -59,6 +60,7 @@ def run( # type: ignore
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = None,
id_hash_keys: Optional[List[str]] = None,
):
processed_documents = self.process(
documents=documents,
Expand All @@ -69,6 +71,7 @@ def run( # type: ignore
split_length=split_length,
split_overlap=split_overlap,
split_respect_sentence_boundary=split_respect_sentence_boundary,
id_hash_keys=id_hash_keys,
)
result = {"documents": processed_documents}
return result, "output_1"
Expand All @@ -83,6 +86,7 @@ def run_batch( # type: ignore
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = None,
id_hash_keys: Optional[List[str]] = None,
):
return self.run(
documents=documents,
Expand All @@ -93,4 +97,5 @@ def run_batch( # type: ignore
split_length=split_length,
split_overlap=split_overlap,
split_respect_sentence_boundary=split_respect_sentence_boundary,
id_hash_keys=id_hash_keys,
)
16 changes: 16 additions & 0 deletions test/nodes/test_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
TikaConverter,
AzureConverter,
ParsrConverter,
TextConverter,
)

from ..conftest import SAMPLES_PATH
Expand Down Expand Up @@ -172,3 +173,18 @@ def test_parsr_converter():
assert docs[1].content_type == "text"
assert docs[1].content.startswith("A sample PDF file")
assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")


def test_id_hash_keys_from_pipeline_params():
    """Converter must honor ``id_hash_keys`` supplied at run time (pipeline params)."""
    doc_path = SAMPLES_PATH / "docs" / "doc_1.txt"
    # The same file twice — only the metadata distinguishes the two documents.
    metadata = [{"key": "a"}, {"key": "b"}]

    converter = TextConverter()
    output, _ = converter.run(
        file_paths=[doc_path, doc_path], meta=metadata, id_hash_keys=["content", "meta"]
    )
    converted_docs = output["documents"]

    assert len(converted_docs) == 2
    # Hashing over content *and* meta must yield two distinct ids.
    assert len({doc.id for doc in converted_docs}) == 2
14 changes: 14 additions & 0 deletions test/nodes/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,17 @@ def test_remove_substrings():
assert "🪲" not in documents[0].content
assert "whitespace" in documents[0].content
assert "✨" in documents[0].content


def test_id_hash_keys_from_pipeline_params():
    """PreProcessor must honor ``id_hash_keys`` supplied at run time (pipeline params)."""
    doc_a = Document(content="This is a document.", meta={"key": "a"})
    doc_b = Document(content="This is a document.", meta={"key": "b"})
    # Default id hashing uses content only, so identical texts collide.
    assert doc_a.id == doc_b.id

    preprocessor = PreProcessor(split_length=2, split_respect_sentence_boundary=False)
    output, _ = preprocessor.run(
        documents=[doc_a, doc_b], id_hash_keys=["content", "meta"]
    )
    split_docs = output["documents"]

    assert len(split_docs) == 4
    # Including meta in the hash makes every split document's id unique.
    assert len({doc.id for doc in split_docs}) == 4

0 comments on commit 1168f63

Please sign in to comment.