feat: store id_hash_keys in Document objects to make documents clonable #3697

Merged — 26 commits merged into main from clonable-documents on Jan 23, 2023

Changes from all commits (26 commits):
fc88d0d
store id_hash_keys in Document objects
ZanSara Dec 12, 2022
d86b7a4
fix id_hash_keys calls throughout codebase
ZanSara Dec 12, 2022
4cdd606
generate schema
ZanSara Dec 12, 2022
e1c3134
Merge branch 'main' into clonable-documents
ZanSara Dec 12, 2022
7bee159
fix es
ZanSara Dec 12, 2022
ca75cee
Merge branch 'main' into clonable-documents
ZanSara Dec 12, 2022
d581e8f
fix weaviate
ZanSara Dec 12, 2022
e14d731
backward compatible
ZanSara Dec 12, 2022
3da9a72
openapi schema
ZanSara Dec 12, 2022
61722a6
Merge branch 'main' into clonable-documents
ZanSara Jan 12, 2023
ebd4dd5
remove unused deprecation warning
ZanSara Jan 12, 2023
663e520
remove unused imports
ZanSara Jan 12, 2023
102e962
openapi
ZanSara Jan 12, 2023
9baa21e
unused var
ZanSara Jan 12, 2023
f80f86a
Merge branch 'main' into clonable-documents
ZanSara Jan 17, 2023
c31c1aa
Apply suggestions from code review
ZanSara Jan 17, 2023
240028c
Update haystack/schema.py
ZanSara Jan 17, 2023
2752c77
Apply suggestions from code review
ZanSara Jan 17, 2023
227ce56
Update haystack/schema.py
ZanSara Jan 17, 2023
429e393
review feedback
ZanSara Jan 17, 2023
d7092a7
Merge branch 'clonable-documents' of github.com:deepset-ai/haystack i…
ZanSara Jan 17, 2023
ea51207
trailing spaces
ZanSara Jan 17, 2023
f9672c0
pylint
ZanSara Jan 17, 2023
9dbf056
Merge branch 'main' into clonable-documents
ZanSara Jan 19, 2023
f80bd04
add deprecation test
ZanSara Jan 19, 2023
c15d58a
Merge branch 'main' into clonable-documents
ZanSara Jan 19, 2023
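
In short: this PR turns `id_hash_keys` from an `__init__`-only argument into a stored attribute of `Document`, so a document's hashing configuration survives serialization and documents can be cloned faithfully. A minimal sketch of the new behavior, based on the tests added in this PR:

```python
from haystack.schema import Document

# id_hash_keys is now persisted on the Document itself...
doc = Document(
    content="this is the content of the document",
    meta={"some": "meta"},
    id_hash_keys=["content", "meta"],
)

# ...so a to_dict()/from_dict() round trip yields an equal clone that
# hashes to the same id.
clone = Document.from_dict(doc.to_dict())
assert clone == doc
assert clone.id_hash_keys == ["content", "meta"]
```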
10 changes: 10 additions & 0 deletions docs/_src/api/openapi/openapi-1.12.0rc0.json
@@ -659,6 +659,16 @@
"type": "object",
"default": {}
},
"id_hash_keys": {
"title": "Id Hash Keys",
"type": "array",
"items": {
"type": "string"
},
"default": [
"content"
]
},
"score": {
"title": "Score",
"type": "number"
1,046 changes: 1,046 additions & 0 deletions docs/_src/api/openapi/openapi-1.12.0rc2.json

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions docs/_src/api/openapi/openapi-1.13.0rc0.json
@@ -659,6 +659,16 @@
"type": "object",
"default": {}
},
"id_hash_keys": {
"title": "Id Hash Keys",
"type": "array",
"items": {
"type": "string"
},
"default": [
"content"
]
},
"score": {
"title": "Score",
"type": "number"
10 changes: 10 additions & 0 deletions docs/_src/api/openapi/openapi.json
@@ -659,6 +659,16 @@
"type": "object",
"default": {}
},
"id_hash_keys": {
"title": "Id Hash Keys",
"type": "array",
"items": {
"type": "string"
},
"default": [
"content"
]
},
"score": {
"title": "Score",
"type": "number"
11 changes: 7 additions & 4 deletions haystack/document_stores/base.py
@@ -561,10 +561,13 @@ def run( # type: ignore
"""

field_map = self._create_document_field_map()
doc_objects = [
Document.from_dict(d, field_map=field_map, id_hash_keys=id_hash_keys) if isinstance(d, dict) else d
for d in documents
]
doc_objects = []
for d in documents:
if isinstance(d, dict):
d["id_hash_keys"] = id_hash_keys
doc_objects.append(Document.from_dict(d, field_map=field_map))
else:
doc_objects.append(d)
self.write_documents(documents=doc_objects, index=index, headers=headers)
return {}, "output_1"

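
The rewritten loop above also defines the new calling convention for plain dicts: `id_hash_keys` travels inside the dict instead of being passed to `Document.from_dict()` as a keyword. A self-contained sketch of the same pattern:

```python
from haystack.schema import Document

documents = [{"content": "some text", "meta": {"name": "doc1"}}]
id_hash_keys = ["content", "meta"]

# Dicts get id_hash_keys written into them before conversion;
# ready-made Document objects pass through unchanged.
doc_objects = []
for d in documents:
    if isinstance(d, dict):
        d["id_hash_keys"] = id_hash_keys
        doc_objects.append(Document.from_dict(d))
    else:
        doc_objects.append(d)
```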
3 changes: 2 additions & 1 deletion haystack/document_stores/search_engine.py
@@ -1144,7 +1144,7 @@ def _convert_es_hit_to_document(
meta_data = {
k: v
for k, v in hit["_source"].items()
if k not in (self.content_field, "content_type", self.embedding_field)
if k not in (self.content_field, "content_type", "id_hash_keys", self.embedding_field)
}
name = meta_data.pop(self.name_field, None)
if name:
@@ -1173,6 +1173,7 @@ def _convert_es_hit_to_document(
"id": hit["_id"],
"content": hit["_source"].get(self.content_field),
"content_type": hit["_source"].get("content_type", None),
"id_hash_keys": hit["_source"].get("id_hash_keys", None),
"meta": meta_data,
"score": score,
"embedding": embedding,
13 changes: 12 additions & 1 deletion haystack/document_stores/weaviate.py
@@ -261,6 +261,10 @@ def _convert_weaviate_result_to_document(
if props.get("content_type") is not None:
content_type = str(props.pop("content_type"))

id_hash_keys = None
if props.get("id_hash_keys") is not None:
id_hash_keys = props.pop("id_hash_keys")

# Weaviate creates "_additional" key for semantic search
if "_additional" in props:
if "certainty" in props["_additional"]:
@@ -293,7 +297,14 @@
meta_data[k] = v

document = Document.from_dict(
{"id": id, "content": content, "content_type": content_type, "meta": meta_data, "score": score}
{
"id": id,
"content": content,
"content_type": content_type,
"meta": meta_data,
"score": score,
"id_hash_keys": id_hash_keys,
}
)

if return_embedding and embedding:
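
On the read path, the stores now lift `id_hash_keys` out of the stored properties so it becomes a first-class `Document` attribute instead of landing in `meta`. A sketch of that extraction; the `props` payload here is hypothetical:

```python
from haystack.schema import Document

# Hypothetical properties as they might come back from Weaviate.
props = {
    "content": "some text",
    "content_type": "text",
    "id_hash_keys": ["content", "meta"],
    "name": "doc1",
}

id_hash_keys = props.pop("id_hash_keys", None)  # keep it out of meta
doc = Document.from_dict(
    {
        "id": "d1",
        "content": props.pop("content"),
        "content_type": props.pop("content_type"),
        "meta": props,  # whatever remains is user metadata
        "id_hash_keys": id_hash_keys,
    }
)
assert doc.meta == {"name": "doc1"}
```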
8 changes: 6 additions & 2 deletions haystack/nodes/connector/crawler.py
@@ -295,7 +295,9 @@ def _write_to_files(
if base_url:
data["meta"]["base_url"] = base_url
data["content"] = text
document = Document.from_dict(data, id_hash_keys=id_hash_keys)
if id_hash_keys:
data["id_hash_keys"] = id_hash_keys
document = Document.from_dict(data)

if crawler_naming_function is not None:
file_name_prefix = crawler_naming_function(link, text)
@@ -382,7 +384,9 @@ def run( # type: ignore
crawled_data = []
for _file in file_paths:
with open(_file.absolute(), "r") as read_file:
crawled_data.append(Document.from_dict(json.load(read_file), id_hash_keys=id_hash_keys))
document = json.load(read_file)
document["id_hash_keys"] = id_hash_keys
crawled_data.append(Document.from_dict(document))
results = {"documents": crawled_data}
else:
results = {"paths": file_paths}
7 changes: 5 additions & 2 deletions haystack/nodes/preprocessor/preprocessor.py
@@ -269,7 +269,8 @@ def clean(
id_hash_keys = self.id_hash_keys

if isinstance(document, dict):
document = Document.from_dict(document, id_hash_keys=id_hash_keys)
document["id_hash_keys"] = id_hash_keys
document = Document.from_dict(document)

# Mainly needed for type checking
if not isinstance(document, Document):
@@ -320,7 +321,9 @@ def split(
id_hash_keys = self.id_hash_keys

if isinstance(document, dict):
document = Document.from_dict(document, id_hash_keys=id_hash_keys)
document["id_hash_keys"] = id_hash_keys
document = Document.from_dict(document)

# Mainly needed for type checking
if not isinstance(document, Document):
raise HaystackError("Document must not be of type 'dict' but of type 'Document'.")
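
`clean()` and `split()` follow the same convention, which is the point of the whole change: every unit produced from a document inherits that document's hashing configuration. A sketch, assuming the `id_hash_keys` init parameter that PreProcessor already had before this PR:

```python
from haystack.nodes import PreProcessor

# Every split inherits the configured hashing keys.
preprocessor = PreProcessor(
    split_by="word",
    split_length=5,
    split_respect_sentence_boundary=False,
    id_hash_keys=["content", "meta"],
)
splits = preprocessor.process(
    [{"content": "one two three four five six seven eight nine ten",
      "meta": {"name": "doc1"}}]
)
assert all(doc.id_hash_keys == ["content", "meta"] for doc in splits)
```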
77 changes: 51 additions & 26 deletions haystack/schema.py
@@ -1,5 +1,6 @@
from __future__ import annotations
import csv
import warnings
import hashlib
import inspect

@@ -16,7 +17,7 @@
import time
import json
import ast
from dataclasses import asdict, InitVar
from dataclasses import asdict

import mmh3
import numpy as np
@@ -46,9 +47,9 @@ class Document:
content: Union[str, pd.DataFrame]
content_type: ContentTypes = Field(default="text")
meta: Dict[str, Any] = Field(default={})
id_hash_keys: List[str] = Field(default=["content"])
score: Optional[float] = None
embedding: Optional[np.ndarray] = None
id_hash_keys: InitVar[Optional[List[str]]] = None

# We use a custom init here as we want some custom logic. The annotations above are however still needed in order
# to use some dataclass magic like "asdict()". See https://www.python.org/dev/peps/pep-0557/#custom-init-method
@@ -99,17 +100,25 @@ def __init__(
allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding"]

if id_hash_keys is not None:
if not set(id_hash_keys) <= set(allowed_hash_key_attributes): # type: ignore
if not set(id_hash_keys) <= set(allowed_hash_key_attributes):
raise ValueError(
f"You passed custom strings {id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). See https://github.com/deepset-ai/haystack/pull/1910 for details)"
f"You passed custom strings {id_hash_keys} to id_hash_keys which is deprecated. Supply instead a "
f"list of Document's attribute names (like {', '.join(allowed_hash_key_attributes)}). "
"See https://github.com/deepset-ai/haystack/pull/1910 for details)"
)
# We store id_hash_keys to be able to clone documents, for example when splitting them during pre-processing
self.id_hash_keys = id_hash_keys or ["content"]

if embedding is not None:
embedding = np.asarray(embedding)
self.embedding = embedding

# Create a unique ID (either new one, or one from user input)
if id is not None:
logger.info(
"Setting the ID manually. This might cause a mismatch with the ID "
"that would be generated from the document content and id_hash_keys value."
)
self.id: str = str(id)
else:
self.id: str = self._get_id(id_hash_keys=id_hash_keys)
@@ -131,28 +140,31 @@ def _get_id(self, id_hash_keys: Optional[List[str]] = None):

if final_hash_key == "":
raise ValueError(
f"Cant't create 'Document': 'id_hash_keys' must contain at least one of ['content', 'meta']"
"Can't create 'Document': 'id_hash_keys' must contain at least one of ['content', 'meta'] or be set to None."
)

return "{:02x}".format(mmh3.hash128(final_hash_key, signed=False))

def to_dict(self, field_map={}) -> Dict:
def to_dict(self, field_map: Optional[Dict[str, Any]] = None) -> Dict:
"""
Convert Document to dict. An optional field_map can be supplied to change the names of the keys in the
resulting dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
they are serialized / stored in other places (e.g. elasticsearch)
Example:

```python
doc = Document(content="some text", content_type="text")
doc.to_dict(field_map={"custom_content_field": "content"})
doc = Document(content="some text", content_type="text")
doc.to_dict(field_map={"custom_content_field": "content"})

# Returns {"custom_content_field": "some text", "content_type": "text"}
# Returns {"custom_content_field": "some text", "content_type": "text"}
```

:param field_map: Dict with keys being the custom target keys and values being the standard Document attributes
:return: dict with content of the Document
"""
if not field_map:
field_map = {}

inv_field_map = {v: k for k, v in field_map.items()}
_doc: Dict[str, str] = {}
for k, v in self.__dict__.items():
@@ -169,25 +181,37 @@ def to_dict(self, field_map={}) -> Dict:

@classmethod
def from_dict(
cls, dict: Dict[str, Any], field_map: Dict[str, Any] = {}, id_hash_keys: Optional[List[str]] = None
cls, dict: Dict[str, Any], field_map: Optional[Dict[str, Any]] = None, id_hash_keys: Optional[List[str]] = None
) -> Document:
"""
Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
Create Document from dict. An optional `field_map` parameter can be supplied to adjust for custom names of the keys in the
input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
they are serialized / stored in other places (e.g. elasticsearch)
they are serialized / stored in other places (e.g. elasticsearch).

Example:

```python
my_dict = {"custom_content_field": "some text", content_type": "text"}
Document.from_dict(my_dict, field_map={"custom_content_field": "content"})
my_dict = {"custom_content_field": "some text", "content_type": "text"}
Document.from_dict(my_dict, field_map={"custom_content_field": "content"})
```

:param field_map: Dict with keys being the custom target keys and values being the standard Document attributes
:return: dict with content of the Document
:return: A Document object
"""
if not field_map:
field_map = {}
if id_hash_keys:
warnings.warn(
message="Passing id_hash_keys directly is deprecated: Document objects now store such information internally.\n"
"Old API: Document.from_dict({'content': 'test', 'meta': {'some': 'value'}}, id_hash_keys=['meta'])\n"
"New API: Document.from_dict({'content': 'test', 'meta': {'some': 'value'}, 'id_hash_keys': ['meta']})\n",
category=DeprecationWarning,
stacklevel=2,
)
dict["id_hash_keys"] = id_hash_keys

_doc = dict.copy()
init_args = ["content", "content_type", "id", "score", "question", "meta", "embedding"]
init_args = ["content", "content_type", "id", "score", "id_hash_keys", "question", "meta", "embedding"]
if "meta" not in _doc.keys():
_doc["meta"] = {}
# copy additional fields into "meta"
Expand All @@ -206,31 +230,32 @@ def from_dict(
k = field_map[k]
_new_doc[k] = v

if _doc.get("id") is None:
_new_doc["id_hash_keys"] = id_hash_keys

# Convert list of rows to pd.DataFrame
if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
_new_doc["content"] = pd.DataFrame(columns=_new_doc["content"][0], data=_new_doc["content"][1:])

return cls(**_new_doc)

def to_json(self, field_map={}) -> str:
d = self.to_dict(field_map=field_map)
j = json.dumps(d, cls=NumpyEncoder)
return j
def to_json(self, field_map: Optional[Dict[str, Any]] = None) -> str:
if not field_map:
field_map = {}
dictionary = self.to_dict(field_map=field_map)
return json.dumps(dictionary, cls=NumpyEncoder)

@classmethod
def from_json(cls, data: str, field_map={}):
d = json.loads(data)
return cls.from_dict(d, field_map=field_map)
def from_json(cls, data: str, field_map: Optional[Dict[str, Any]] = None) -> Document:
if not field_map:
field_map = {}
dictionary = json.loads(data)
return cls.from_dict(dictionary, field_map=field_map)

def __eq__(self, other):
return (
isinstance(other, self.__class__)
and getattr(other, "content", None) == self.content
and getattr(other, "content_type", None) == self.content_type
and getattr(other, "id", None) == self.id
and getattr(other, "id_hash_keys", None) == self.id_hash_keys
and getattr(other, "score", None) == self.score
and getattr(other, "meta", None) == self.meta
and np.array_equal(getattr(other, "embedding", None), self.embedding)
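
The deprecation shim in `from_dict()` keeps the old keyword working while steering callers toward the new dict field. A short sketch of both APIs side by side, mirroring the warning message:

```python
import warnings

from haystack.schema import Document

d = {"content": "test", "meta": {"some": "value"}}

# Old API: still accepted, but now raises a DeprecationWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    old_style = Document.from_dict(dict(d), id_hash_keys=["meta"])
assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# New API: id_hash_keys lives inside the dict itself.
new_style = Document.from_dict({**d, "id_hash_keys": ["meta"]})
assert old_style == new_style
```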
32 changes: 30 additions & 2 deletions test/others/test_schema.py
@@ -3,7 +3,7 @@
import numpy as np
import pandas as pd

from ..conftest import SAMPLES_PATH
from ..conftest import SAMPLES_PATH, fail_at_version

LABELS = [
Label(
@@ -45,6 +45,26 @@
]


def test_document_from_dict():
doc = Document(
content="this is the content of the document", meta={"some": "meta"}, id_hash_keys=["content", "meta"]
)
assert doc == Document.from_dict(doc.to_dict())


@fail_at_version(1, 15)
def test_deprecated_id_hash_keys_in_document_from_dict():
doc = Document(
content="this is the content of the document", meta={"some": "meta"}, id_hash_keys=["content", "meta"]
)
# id_hash_keys in Document.from_dict() is deprecated and should be removed.
with pytest.warns(DeprecationWarning):
assert doc == Document.from_dict(
{"content": "this is the content of the document", "meta": {"some": "meta"}},
id_hash_keys=["content", "meta"],
)


def test_no_answer_label():
labels = [
Label(
@@ -152,6 +172,7 @@ def test_doc_to_json():
d = Document(
content="some text",
content_type="text",
id_hash_keys=["meta"],
score=0.99988,
meta={"name": "doc1"},
embedding=np.random.rand(768).astype(np.float32),
@@ -161,7 +182,14 @@
assert d == d_new

# No embedding
d = Document(content="some text", content_type="text", score=0.99988, meta={"name": "doc1"}, embedding=None)
d = Document(
content="some text",
content_type="text",
score=0.99988,
meta={"name": "doc1"},
id_hash_keys=["meta"],
embedding=None,
)
j0 = d.to_json()
d_new = Document.from_json(j0)
assert d == d_new