deepset-ai · ZanSara · Nov 8, 2022 · Nov 9, 2022 · Nov 10, 2022 · Nov 10, 2022
@@ -661,6 +661,13 @@
                     "embedding": {
                         "title": "Embedding",
                         "type": "string"
+                    },
+                    "id_hash_keys": {
+                        "title": "Id Hash Keys",
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
                     }
                 }
             },

@@ -661,6 +661,13 @@
                     "embedding": {
                         "title": "Embedding",
                         "type": "string"
+                    },
+                    "id_hash_keys": {
+                        "title": "Id Hash Keys",
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
                     }
                 }
             },

@@ -442,7 +442,7 @@ def add_eval_data(
         if preprocessor is not None:
             assert preprocessor.split_by != "sentence", (
                 f"Split by sentence not supported.\n"
-                f"Please set 'split_by' to either 'word' or 'passage' in the supplied PreProcessor."
+                f"Please set 'split_by' to either 'word', 'paragraph', or 'page' in the supplied PreProcessor."
             )
             assert preprocessor.split_respect_sentence_boundary == False, (
                 f"split_respect_sentence_boundary not supported yet.\n"

@@ -6,7 +6,7 @@
 from haystack.schema import Document
 from haystack.document_stores.base import BaseDocumentStore
 from haystack.document_stores.filter_utils import LogicalFilterClause
-from haystack.nodes.preprocessor.preprocessor import PreProcessor
+from haystack.nodes.preprocessor.preprocessor_old import PreProcessor
 
 
 def open_search_index_to_document_store(

@@ -149,7 +149,7 @@ def _extract_docs_and_labels_from_dict(
         ## Create Document
         cur_full_doc = Document(content=paragraph["context"], meta=cur_meta)
         if preprocessor is not None:
-            splits_docs = preprocessor.process(documents=[cur_full_doc])
+            splits_docs = preprocessor.run(documents=[cur_full_doc])[0]["documents"]
             # we need to pull in _split_id into the document id for unique reference in labels
             splits: List[Document] = []
             offset = 0

@@ -256,6 +256,10 @@ def _convert_weaviate_result_to_document(
         if props.get("content_type") is not None:
             content_type = str(props.pop("content_type"))
 
+        id_hash_keys = None
+        if props.get("id_hash_keys") is not None:
+            id_hash_keys = str(props.pop("id_hash_keys"))
+
         # Weaviate creates "_additional" key for semantic search
         if "_additional" in props:
             if "certainty" in props["_additional"]:
@@ -288,7 +292,14 @@ def _convert_weaviate_result_to_document(
             meta_data[k] = v
 
         document = Document.from_dict(
-            {"id": id, "content": content, "content_type": content_type, "meta": meta_data, "score": score}
+            {
+                "id": id,
+                "content": content,
+                "content_type": content_type,
+                "meta": meta_data,
+                "id_hash_keys": id_hash_keys,
+                "score": score,
+            }
         )
 
         if return_embedding and embedding:

@@ -21,8 +21,14 @@
     ParsrConverter,
 )
 from haystack.nodes.label_generator import PseudoLabelGenerator
-from haystack.nodes.other import Docs2Answers, JoinDocuments, RouteDocuments, JoinAnswers, DocumentMerger
-from haystack.nodes.preprocessor import BasePreProcessor, PreProcessor
+from haystack.nodes.other import Docs2Answers, JoinDocuments, RouteDocuments, JoinAnswers
+from haystack.nodes.preprocessor import (
+    PreProcessor,
+    DocumentMerger,
+    DocumentSplitter,
+    DocumentCleaner,
+    DocumentPreProcessor,
+)
 from haystack.nodes.query_classifier import SklearnQueryClassifier, TransformersQueryClassifier
 from haystack.nodes.question_generator import QuestionGenerator
 from haystack.nodes.ranker import BaseRanker, SentenceTransformersRanker

@@ -3,4 +3,3 @@
 from haystack.nodes.other.route_documents import RouteDocuments
 from haystack.nodes.other.join_answers import JoinAnswers
 from haystack.nodes.other.join import JoinNode
-from haystack.nodes.other.document_merger import DocumentMerger
@@ -1,2 +1,5 @@
-from haystack.nodes.preprocessor.base import BasePreProcessor
-from haystack.nodes.preprocessor.preprocessor import PreProcessor
+from haystack.nodes.preprocessor.preprocessor_old import PreProcessor
+from haystack.nodes.preprocessor.preprocessor_new import DocumentPreProcessor
+from haystack.nodes.preprocessor.splitter import DocumentSplitter
+from haystack.nodes.preprocessor.cleaner import DocumentCleaner
+from haystack.nodes.preprocessor.merger import DocumentMerger