Skip to content

Commit

Permalink
Fix using id_hash_keys as pipeline params (#2717)
Browse files Browse the repository at this point in the history
* Fix using id_hash_keys as pipeline params

* Update Documentation & Code Style

* add tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
tstadel and github-actions[bot] authored Jun 24, 2022
1 parent a084a98 commit 1168f63
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 6 deletions.
6 changes: 5 additions & 1 deletion docs/_src/api/api/file_converter.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ Validate if the language of the text is one of valid languages.
#### BaseConverter.run

```python
def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None)
```

Extract text from a file.
Expand Down Expand Up @@ -114,6 +114,10 @@ This option can be used to add test for encoding errors. If the extracted text i
not one of the valid languages, then it is likely an encoding error resulting
in garbled text.
- `encoding`: Select the file encoding (default is `UTF-8`)
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.

<a id="docx"></a>

Expand Down
2 changes: 1 addition & 1 deletion docs/_src/api/api/preprocessor.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class BasePreProcessor(BaseComponent)

```python
@abstractmethod
def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[Document]
def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True, id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Perform document cleaning and splitting. Takes a single Document or a List of Documents as input and returns a
Expand Down
20 changes: 16 additions & 4 deletions haystack/nodes/file_converter/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def run( # type: ignore
known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
id_hash_keys: Optional[List[str]] = None,
):
"""
Extract text from a file.
Expand All @@ -162,6 +163,10 @@ def run( # type: ignore
not one of the valid languages, then it might likely be encoding error resulting
in garbled text.
:param encoding: Select the file encoding (default is `UTF-8`)
:param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
"""

if isinstance(file_paths, Path):
Expand All @@ -178,6 +183,7 @@ def run( # type: ignore
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
encoding=encoding,
id_hash_keys=id_hash_keys,
):
documents.append(doc)

Expand All @@ -192,14 +198,20 @@ def run( # type: ignore

def run_batch(  # type: ignore
    self,
    file_paths: Union[Path, List[Path]],
    meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
    remove_numeric_tables: Optional[bool] = None,
    known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
    valid_languages: Optional[List[str]] = None,
    encoding: Optional[str] = "UTF-8",
    id_hash_keys: Optional[List[str]] = None,
):
    """
    Batch entry point for the converter.

    Conversion of multiple files is already handled by `run`, so this method
    simply forwards every parameter — including `id_hash_keys`, which controls
    how document ids are derived — to `run` unchanged.
    """
    # Collect the arguments once and forward them all; keeps the delegation
    # in lockstep with the signature.
    forwarded = dict(
        file_paths=file_paths,
        meta=meta,
        remove_numeric_tables=remove_numeric_tables,
        known_ligatures=known_ligatures,
        valid_languages=valid_languages,
        encoding=encoding,
        id_hash_keys=id_hash_keys,
    )
    return self.run(**forwarded)
5 changes: 5 additions & 0 deletions haystack/nodes/preprocessor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def process(
split_length: Optional[int] = 1000,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = True,
id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
"""
Perform document cleaning and splitting. Takes a single Document or a List of Documents as input and returns a
Expand Down Expand Up @@ -59,6 +60,7 @@ def run( # type: ignore
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = None,
id_hash_keys: Optional[List[str]] = None,
):
processed_documents = self.process(
documents=documents,
Expand All @@ -69,6 +71,7 @@ def run( # type: ignore
split_length=split_length,
split_overlap=split_overlap,
split_respect_sentence_boundary=split_respect_sentence_boundary,
id_hash_keys=id_hash_keys,
)
result = {"documents": processed_documents}
return result, "output_1"
Expand All @@ -83,6 +86,7 @@ def run_batch( # type: ignore
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = None,
id_hash_keys: Optional[List[str]] = None,
):
return self.run(
documents=documents,
Expand All @@ -93,4 +97,5 @@ def run_batch( # type: ignore
split_length=split_length,
split_overlap=split_overlap,
split_respect_sentence_boundary=split_respect_sentence_boundary,
id_hash_keys=id_hash_keys,
)
16 changes: 16 additions & 0 deletions test/nodes/test_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
TikaConverter,
AzureConverter,
ParsrConverter,
TextConverter,
)

from ..conftest import SAMPLES_PATH
Expand Down Expand Up @@ -172,3 +173,18 @@ def test_parsr_converter():
assert docs[1].content_type == "text"
assert docs[1].content.startswith("A sample PDF file")
assert docs[1].content.endswith("Page 4 of Sample PDF\n… the page 3 is empty.")


def test_id_hash_keys_from_pipeline_params():
    """Converter must honor ``id_hash_keys`` supplied at run time (pipeline params)."""
    doc_path = SAMPLES_PATH / "docs" / "doc_1.txt"
    # The same file twice — only the metadata distinguishes the two documents.
    metadata = [{"key": "a"}, {"key": "b"}]

    converter = TextConverter()
    output, _ = converter.run(
        file_paths=[doc_path, doc_path], meta=metadata, id_hash_keys=["content", "meta"]
    )
    converted_docs = output["documents"]

    assert len(converted_docs) == 2
    # Hashing over content *and* meta must yield two distinct ids.
    assert len({doc.id for doc in converted_docs}) == 2
14 changes: 14 additions & 0 deletions test/nodes/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,17 @@ def test_remove_substrings():
assert "🪲" not in documents[0].content
assert "whitespace" in documents[0].content
assert "✨" in documents[0].content


def test_id_hash_keys_from_pipeline_params():
    """PreProcessor must honor ``id_hash_keys`` supplied at run time (pipeline params)."""
    doc_a = Document(content="This is a document.", meta={"key": "a"})
    doc_b = Document(content="This is a document.", meta={"key": "b"})
    # Default id hashing uses content only, so identical texts collide.
    assert doc_a.id == doc_b.id

    preprocessor = PreProcessor(split_length=2, split_respect_sentence_boundary=False)
    output, _ = preprocessor.run(
        documents=[doc_a, doc_b], id_hash_keys=["content", "meta"]
    )
    split_docs = output["documents"]

    assert len(split_docs) == 4
    # Including meta in the hash makes every split document's id unique.
    assert len({doc.id for doc in split_docs}) == 4

0 comments on commit 1168f63

Please sign in to comment.