feat: store id_hash_keys in Document objects to make documents clonable #3697

Merged — 26 commits merged into main from clonable-documents on Jan 23, 2023

Changes from all commits (26 commits):
fc88d0d
store id_hash_keys in Document objects
ZanSara Dec 12, 2022
d86b7a4
fix id_hash_keys calls throughout codebase
ZanSara Dec 12, 2022
4cdd606
generate schema
ZanSara Dec 12, 2022
e1c3134
Merge branch 'main' into clonable-documents
ZanSara Dec 12, 2022
7bee159
fix es
ZanSara Dec 12, 2022
ca75cee
Merge branch 'main' into clonable-documents
ZanSara Dec 12, 2022
d581e8f
fix weaviate
ZanSara Dec 12, 2022
e14d731
backward compatible
ZanSara Dec 12, 2022
3da9a72
openapi schema
ZanSara Dec 12, 2022
61722a6
Merge branch 'main' into clonable-documents
ZanSara Jan 12, 2023
ebd4dd5
remove unused deprecation warning
ZanSara Jan 12, 2023
663e520
remove unused imports
ZanSara Jan 12, 2023
102e962
openapi
ZanSara Jan 12, 2023
9baa21e
unused var
ZanSara Jan 12, 2023
f80f86a
Merge branch 'main' into clonable-documents
ZanSara Jan 17, 2023
c31c1aa
Apply suggestions from code review
ZanSara Jan 17, 2023
240028c
Update haystack/schema.py
ZanSara Jan 17, 2023
2752c77
Apply suggestions from code review
ZanSara Jan 17, 2023
227ce56
Update haystack/schema.py
ZanSara Jan 17, 2023
429e393
review feedback
ZanSara Jan 17, 2023
d7092a7
Merge branch 'clonable-documents' of github.com:deepset-ai/haystack i…
ZanSara Jan 17, 2023
ea51207
trailing spaces
ZanSara Jan 17, 2023
f9672c0
pylint
ZanSara Jan 17, 2023
9dbf056
Merge branch 'main' into clonable-documents
ZanSara Jan 19, 2023
f80bd04
add deprecation test
ZanSara Jan 19, 2023
c15d58a
Merge branch 'main' into clonable-documents
ZanSara Jan 19, 2023
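
In short: this PR turns `id_hash_keys` from an `__init__`-only argument into a stored attribute of `Document`, so a document's hashing configuration survives serialization and documents can be cloned faithfully. A minimal sketch of the new behavior, based on the tests added in this PR:

```python
from haystack.schema import Document

# id_hash_keys is now persisted on the Document itself...
doc = Document(
    content="this is the content of the document",
    meta={"some": "meta"},
    id_hash_keys=["content", "meta"],
)

# ...so a to_dict()/from_dict() round trip yields an equal clone that
# hashes to the same id.
clone = Document.from_dict(doc.to_dict())
assert clone == doc
assert clone.id_hash_keys == ["content", "meta"]
```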
10 changes: 10 additions & 0 deletions docs/_src/api/openapi/openapi-1.12.0rc0.json
@@ -659,6 +659,16 @@
"type": "object",
"default": {}
},
"id_hash_keys": {
"title": "Id Hash Keys",
"type": "array",
"items": {
"type": "string"
},
"default": [
"content"
]
},
"score": {
"title": "Score",
"type": "number"
1,046 changes: 1,046 additions & 0 deletions docs/_src/api/openapi/openapi-1.12.0rc2.json

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions docs/_src/api/openapi/openapi-1.13.0rc0.json
@@ -659,6 +659,16 @@
"type": "object",
"default": {}
},
"id_hash_keys": {
"title": "Id Hash Keys",
"type": "array",
"items": {
"type": "string"
},
"default": [
"content"
]
},
"score": {
"title": "Score",
"type": "number"
10 changes: 10 additions & 0 deletions docs/_src/api/openapi/openapi.json
@@ -659,6 +659,16 @@
"type": "object",
"default": {}
},
"id_hash_keys": {
"title": "Id Hash Keys",
"type": "array",
"items": {
"type": "string"
},
"default": [
"content"
]
},
"score": {
"title": "Score",
"type": "number"
11 changes: 7 additions & 4 deletions haystack/document_stores/base.py
@@ -561,10 +561,13 @@ def run( # type: ignore
"""

field_map = self._create_document_field_map()
doc_objects = [
Document.from_dict(d, field_map=field_map, id_hash_keys=id_hash_keys) if isinstance(d, dict) else d
for d in documents
]
doc_objects = []
for d in documents:
if isinstance(d, dict):
d["id_hash_keys"] = id_hash_keys
doc_objects.append(Document.from_dict(d, field_map=field_map))
else:
doc_objects.append(d)
self.write_documents(documents=doc_objects, index=index, headers=headers)
return {}, "output_1"

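
The rewritten loop above also defines the new calling convention for plain dicts: `id_hash_keys` travels inside the dict instead of being passed to `Document.from_dict()` as a keyword. A self-contained sketch of the same pattern:

```python
from haystack.schema import Document

documents = [{"content": "some text", "meta": {"name": "doc1"}}]
id_hash_keys = ["content", "meta"]

# Dicts get id_hash_keys written into them before conversion;
# ready-made Document objects pass through unchanged.
doc_objects = []
for d in documents:
    if isinstance(d, dict):
        d["id_hash_keys"] = id_hash_keys
        doc_objects.append(Document.from_dict(d))
    else:
        doc_objects.append(d)
```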
3 changes: 2 additions & 1 deletion haystack/document_stores/search_engine.py
@@ -1144,7 +1144,7 @@ def _convert_es_hit_to_document(
meta_data = {
k: v
for k, v in hit["_source"].items()
if k not in (self.content_field, "content_type", self.embedding_field)
if k not in (self.content_field, "content_type", "id_hash_keys", self.embedding_field)
}
name = meta_data.pop(self.name_field, None)
if name:
@@ -1173,6 +1173,7 @@ def _convert_es_hit_to_document(
"id": hit["_id"],
"content": hit["_source"].get(self.content_field),
"content_type": hit["_source"].get("content_type", None),
"id_hash_keys": hit["_source"].get("id_hash_keys", None),
"meta": meta_data,
"score": score,
"embedding": embedding,
13 changes: 12 additions & 1 deletion haystack/document_stores/weaviate.py
@@ -261,6 +261,10 @@ def _convert_weaviate_result_to_document(
if props.get("content_type") is not None:
content_type = str(props.pop("content_type"))

id_hash_keys = None
if props.get("id_hash_keys") is not None:
id_hash_keys = props.pop("id_hash_keys")

# Weaviate creates "_additional" key for semantic search
if "_additional" in props:
if "certainty" in props["_additional"]:
@@ -293,7 +297,14 @@
meta_data[k] = v

document = Document.from_dict(
{"id": id, "content": content, "content_type": content_type, "meta": meta_data, "score": score}
{
"id": id,
"content": content,
"content_type": content_type,
"meta": meta_data,
"score": score,
"id_hash_keys": id_hash_keys,
}
)

if return_embedding and embedding:
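
On the read path, the stores now lift `id_hash_keys` out of the stored properties so it becomes a first-class `Document` attribute instead of landing in `meta`. A sketch of that extraction; the `props` payload here is hypothetical:

```python
from haystack.schema import Document

# Hypothetical properties as they might come back from Weaviate.
props = {
    "content": "some text",
    "content_type": "text",
    "id_hash_keys": ["content", "meta"],
    "name": "doc1",
}

id_hash_keys = props.pop("id_hash_keys", None)  # keep it out of meta
doc = Document.from_dict(
    {
        "id": "d1",
        "content": props.pop("content"),
        "content_type": props.pop("content_type"),
        "meta": props,  # whatever remains is user metadata
        "id_hash_keys": id_hash_keys,
    }
)
assert doc.meta == {"name": "doc1"}
```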
8 changes: 6 additions & 2 deletions haystack/nodes/connector/crawler.py
@@ -295,7 +295,9 @@ def _write_to_files(
if base_url:
data["meta"]["base_url"] = base_url
data["content"] = text
document = Document.from_dict(data, id_hash_keys=id_hash_keys)
if id_hash_keys:
data["id_hash_keys"] = id_hash_keys
document = Document.from_dict(data)

if crawler_naming_function is not None:
file_name_prefix = crawler_naming_function(link, text)
@@ -382,7 +384,9 @@ def run( # type: ignore
crawled_data = []
for _file in file_paths:
with open(_file.absolute(), "r") as read_file:
crawled_data.append(Document.from_dict(json.load(read_file), id_hash_keys=id_hash_keys))
document = json.load(read_file)
document["id_hash_keys"] = id_hash_keys
crawled_data.append(Document.from_dict(document))
results = {"documents": crawled_data}
else:
results = {"paths": file_paths}
7 changes: 5 additions & 2 deletions haystack/nodes/preprocessor/preprocessor.py
@@ -269,7 +269,8 @@ def clean(
id_hash_keys = self.id_hash_keys

if isinstance(document, dict):
document = Document.from_dict(document, id_hash_keys=id_hash_keys)
document["id_hash_keys"] = id_hash_keys
document = Document.from_dict(document)

# Mainly needed for type checking
if not isinstance(document, Document):
@@ -320,7 +321,9 @@ def split(
id_hash_keys = self.id_hash_keys

if isinstance(document, dict):
document = Document.from_dict(document, id_hash_keys=id_hash_keys)
document["id_hash_keys"] = id_hash_keys
document = Document.from_dict(document)

# Mainly needed for type checking
if not isinstance(document, Document):
raise HaystackError("Document must not be of type 'dict' but of type 'Document'.")
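
`clean()` and `split()` follow the same convention, which is the point of the whole change: every unit produced from a document inherits that document's hashing configuration. A sketch, assuming the `id_hash_keys` init parameter that PreProcessor already had before this PR:

```python
from haystack.nodes import PreProcessor

# Every split inherits the configured hashing keys.
preprocessor = PreProcessor(
    split_by="word",
    split_length=5,
    split_respect_sentence_boundary=False,
    id_hash_keys=["content", "meta"],
)
splits = preprocessor.process(
    [{"content": "one two three four five six seven eight nine ten",
      "meta": {"name": "doc1"}}]
)
assert all(doc.id_hash_keys == ["content", "meta"] for doc in splits)
```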
77 changes: 51 additions & 26 deletions haystack/schema.py
@@ -1,5 +1,6 @@
from __future__ import annotations
import csv
import warnings
import hashlib
import inspect

@@ -16,7 +17,7 @@
import time
import json
import ast
from dataclasses import asdict, InitVar
from dataclasses import asdict

import mmh3
import numpy as np
@@ -46,9 +47,9 @@ class Document:
content: Union[str, pd.DataFrame]
content_type: ContentTypes = Field(default="text")
meta: Dict[str, Any] = Field(default={})
id_hash_keys: List[str] = Field(default=["content"])
score: Optional[float] = None
embedding: Optional[np.ndarray] = None
id_hash_keys: InitVar[Optional[List[str]]] = None

# We use a custom init here as we want some custom logic. The annotations above are however still needed in order
# to use some dataclass magic like "asdict()". See https://www.python.org/dev/peps/pep-0557/#custom-init-method
@@ -99,17 +100,25 @@ def __init__(
allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding"]

if id_hash_keys is not None:
if not set(id_hash_keys) <= set(allowed_hash_key_attributes): # type: ignore
if not set(id_hash_keys) <= set(allowed_hash_key_attributes):
raise ValueError(
f"You passed custom strings {id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). See https://github.com/deepset-ai/haystack/pull/1910 for details)"
f"You passed custom strings {id_hash_keys} to id_hash_keys which is deprecated. Supply instead a "
f"list of Document's attribute names (like {', '.join(allowed_hash_key_attributes)}). "
"See https://github.com/deepset-ai/haystack/pull/1910 for details)"
)
# We store id_hash_keys to be able to clone documents, for example when splitting them during pre-processing
self.id_hash_keys = id_hash_keys or ["content"]

if embedding is not None:
embedding = np.asarray(embedding)
self.embedding = embedding

# Create a unique ID (either new one, or one from user input)
if id is not None:
logger.info(
"Setting the ID manually. This might cause a mismatch with the ID "
"that would be generated from the document content and id_hash_keys value."
)
self.id: str = str(id)
else:
self.id: str = self._get_id(id_hash_keys=id_hash_keys)
@@ -131,28 +140,31 @@ def _get_id(self, id_hash_keys: Optional[List[str]] = None):

if final_hash_key == "":
raise ValueError(
f"Cant't create 'Document': 'id_hash_keys' must contain at least one of ['content', 'meta']"
"Can't create 'Document': 'id_hash_keys' must contain at least one of ['content', 'meta'] or be set to None."
)

return "{:02x}".format(mmh3.hash128(final_hash_key, signed=False))

def to_dict(self, field_map={}) -> Dict:
def to_dict(self, field_map: Optional[Dict[str, Any]] = None) -> Dict:
"""
Convert Document to dict. An optional field_map can be supplied to change the names of the keys in the
resulting dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
they are serialized / stored in other places (e.g. elasticsearch)
Example:

```python
doc = Document(content="some text", content_type="text")
doc.to_dict(field_map={"custom_content_field": "content"})
doc = Document(content="some text", content_type="text")
doc.to_dict(field_map={"custom_content_field": "content"})

# Returns {"custom_content_field": "some text", "content_type": "text"}
# Returns {"custom_content_field": "some text", "content_type": "text"}
```

:param field_map: Dict with keys being the custom target keys and values being the standard Document attributes
:return: dict with content of the Document
"""
if not field_map:
field_map = {}

inv_field_map = {v: k for k, v in field_map.items()}
_doc: Dict[str, str] = {}
for k, v in self.__dict__.items():
@@ -169,25 +181,37 @@ def to_dict(self, field_map={}) -> Dict:

@classmethod
def from_dict(
cls, dict: Dict[str, Any], field_map: Dict[str, Any] = {}, id_hash_keys: Optional[List[str]] = None
cls, dict: Dict[str, Any], field_map: Optional[Dict[str, Any]] = None, id_hash_keys: Optional[List[str]] = None
) -> Document:
"""
Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
Create Document from dict. An optional `field_map` parameter can be supplied to adjust for custom names of the keys in the
input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
they are serialized / stored in other places (e.g. elasticsearch)
they are serialized / stored in other places (e.g. elasticsearch).

Example:

```python
my_dict = {"custom_content_field": "some text", content_type": "text"}
Document.from_dict(my_dict, field_map={"custom_content_field": "content"})
my_dict = {"custom_content_field": "some text", "content_type": "text"}
Document.from_dict(my_dict, field_map={"custom_content_field": "content"})
```

:param field_map: Dict with keys being the custom target keys and values being the standard Document attributes
:return: dict with content of the Document
:return: A Document object
"""
if not field_map:
field_map = {}
if id_hash_keys:
warnings.warn(
message="Passing id_hash_keys directly is deprecated: Document objects now store such information internally.\n"
"Old API: Document.from_dict({'content': 'test', 'meta': {'some': 'value'}}, id_hash_keys=['meta'])\n"
"New API: Document.from_dict({'content': 'test', 'meta': {'some': 'value'}, 'id_hash_keys': ['meta']})\n",
category=DeprecationWarning,
stacklevel=2,
)
dict["id_hash_keys"] = id_hash_keys

_doc = dict.copy()
init_args = ["content", "content_type", "id", "score", "question", "meta", "embedding"]
init_args = ["content", "content_type", "id", "score", "id_hash_keys", "question", "meta", "embedding"]
if "meta" not in _doc.keys():
_doc["meta"] = {}
# copy additional fields into "meta"
Expand All @@ -206,31 +230,32 @@ def from_dict(
k = field_map[k]
_new_doc[k] = v

if _doc.get("id") is None:
_new_doc["id_hash_keys"] = id_hash_keys

# Convert list of rows to pd.DataFrame
if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
_new_doc["content"] = pd.DataFrame(columns=_new_doc["content"][0], data=_new_doc["content"][1:])

return cls(**_new_doc)

def to_json(self, field_map={}) -> str:
d = self.to_dict(field_map=field_map)
j = json.dumps(d, cls=NumpyEncoder)
return j
def to_json(self, field_map: Optional[Dict[str, Any]] = None) -> str:
if not field_map:
field_map = {}
dictionary = self.to_dict(field_map=field_map)
return json.dumps(dictionary, cls=NumpyEncoder)

@classmethod
def from_json(cls, data: str, field_map={}):
d = json.loads(data)
return cls.from_dict(d, field_map=field_map)
def from_json(cls, data: str, field_map: Optional[Dict[str, Any]] = None) -> Document:
if not field_map:
field_map = {}
dictionary = json.loads(data)
return cls.from_dict(dictionary, field_map=field_map)

def __eq__(self, other):
return (
isinstance(other, self.__class__)
and getattr(other, "content", None) == self.content
and getattr(other, "content_type", None) == self.content_type
and getattr(other, "id", None) == self.id
and getattr(other, "id_hash_keys", None) == self.id_hash_keys
and getattr(other, "score", None) == self.score
and getattr(other, "meta", None) == self.meta
and np.array_equal(getattr(other, "embedding", None), self.embedding)
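
The deprecation shim in `from_dict()` keeps the old keyword working while steering callers toward the new dict field. A short sketch of both APIs side by side, mirroring the warning message:

```python
import warnings

from haystack.schema import Document

d = {"content": "test", "meta": {"some": "value"}}

# Old API: still accepted, but now raises a DeprecationWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    old_style = Document.from_dict(dict(d), id_hash_keys=["meta"])
assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# New API: id_hash_keys lives inside the dict itself.
new_style = Document.from_dict({**d, "id_hash_keys": ["meta"]})
assert old_style == new_style
```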
32 changes: 30 additions & 2 deletions test/others/test_schema.py
@@ -3,7 +3,7 @@
import numpy as np
import pandas as pd

from ..conftest import SAMPLES_PATH
from ..conftest import SAMPLES_PATH, fail_at_version

LABELS = [
Label(
@@ -45,6 +45,26 @@
]


def test_document_from_dict():
doc = Document(
content="this is the content of the document", meta={"some": "meta"}, id_hash_keys=["content", "meta"]
)
assert doc == Document.from_dict(doc.to_dict())


@fail_at_version(1, 15)
def test_deprecated_id_hash_keys_in_document_from_dict():
doc = Document(
content="this is the content of the document", meta={"some": "meta"}, id_hash_keys=["content", "meta"]
)
# id_hash_keys in Document.from_dict() is deprecated and should be removed.
with pytest.warns(DeprecationWarning):
assert doc == Document.from_dict(
{"content": "this is the content of the document", "meta": {"some": "meta"}},
id_hash_keys=["content", "meta"],
)


def test_no_answer_label():
labels = [
Label(
@@ -152,6 +172,7 @@ def test_doc_to_json():
d = Document(
content="some text",
content_type="text",
id_hash_keys=["meta"],
score=0.99988,
meta={"name": "doc1"},
embedding=np.random.rand(768).astype(np.float32),
@@ -161,7 +182,14 @@
assert d == d_new

# No embedding
d = Document(content="some text", content_type="text", score=0.99988, meta={"name": "doc1"}, embedding=None)
d = Document(
content="some text",
content_type="text",
score=0.99988,
meta={"name": "doc1"},
id_hash_keys=["meta"],
embedding=None,
)
j0 = d.to_json()
d_new = Document.from_json(j0)
assert d == d_new