From 929c685cdad93a7315983f7f01d77e57a4235741 Mon Sep 17 00:00:00 2001 From: Sara Zan Date: Thu, 14 Apr 2022 16:42:02 +0200 Subject: [PATCH] Forbid usage of `*args` and `**kwargs` in any node's `__init__` (#2362) * Add failing test * Remove `**kwargs` from docstores' `__init__` functions (#2407) * Remove kwargs from ESDocStore subclasses * Remove kwargs from subclasses of SQLDocumentStore * Remove kwargs from Weaviate * Revert change in pinecone * Fix tests * Fix retriever test with weaviate * Change Exception into DocumentStoreError * Update Documentation & Code Style * Remove `**kwargs` from `FARMReader` (#2413) * Remove FARMReader kwargs without trying to replace them functionally * Update Documentation & Code Style * enforce same index values before and after saving/loading eval dataframes (#2398) * Add tests for missing `__init__` and `super().__init__()` in custom nodes (#2350) * Add tests for missing init and super * Update Documentation & Code Style * change in with endswith * Move test in pipeline.py and change test in pipeline_yaml.py * Update Documentation & Code Style * Use caplog to test the warning * Update Documentation & Code Style * move tests into test_pipeline and use get_config * Update Documentation & Code Style * Unmock version name * Improve variadic args test * Update Documentation & Code Style Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/api/api/document_store.md | 13 +- docs/_src/api/api/reader.md | 2 +- haystack/document_stores/elasticsearch.py | 161 +- haystack/document_stores/faiss.py | 34 +- haystack/document_stores/milvus1.py | 9 +- haystack/document_stores/milvus2.py | 8 +- haystack/document_stores/pinecone.py | 3 - haystack/document_stores/weaviate.py | 1 - .../haystack-pipeline-1.2.1rc0.schema.json | 3 - .../haystack-pipeline-1.3.1rc0.schema.json | 4191 +++++++++++++++++ .../haystack-pipeline-unstable.schema.json | 395 +- .../haystack-pipeline.schema.json | 19 +- 
haystack/nodes/_json_schema.py | 7 + haystack/nodes/reader/farm.py | 2 - test/conftest.py | 4 +- test/test_pipeline_yaml.py | 66 + test/test_retriever.py | 4 +- 17 files changed, 4848 insertions(+), 74 deletions(-) create mode 100644 haystack/json-schemas/haystack-pipeline-1.3.1rc0.schema.json diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index d46333f696..337d0bc971 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -414,7 +414,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore) #### \_\_init\_\_ ```python -def __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False) +def __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = 
"embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity: str = "dot_product", timeout: int = 30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False) ``` A DocumentStore using Elasticsearch to store and query the documents for our search. @@ -1231,7 +1231,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore) #### \_\_init\_\_ ```python -def __init__(verify_certs=False, scheme="https", username="admin", password="admin", port=9200, **kwargs) +def __init__(scheme: str = "https", username: str = "admin", password: str = "admin", host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", ca_certs: Optional[str] = None, verify_certs: bool = False, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity: str = "dot_product", timeout: int = 30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False) ``` Document Store using OpenSearch 
(https://opensearch.org/). It is compatible with the AWS Elasticsearch Service. @@ -2235,7 +2235,7 @@ the vector embeddings are indexed in a FAISS Index. #### \_\_init\_\_ ```python -def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, **kwargs, ,) +def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, n_links: int = 64, ef_search: int = 20, ef_construction: int = 80) ``` **Arguments**: @@ -2282,6 +2282,9 @@ If specified no other params besides faiss_config_path must be specified. - `faiss_config_path`: Stored FAISS initial configuration parameters. 
Can be created via calling `save()` - `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) +- `n_links`: used only if index_factory == "HNSW" +- `ef_search`: used only if index_factory == "HNSW" +- `ef_construction`: used only if index_factory == "HNSW" @@ -2545,7 +2548,7 @@ Usage: #### \_\_init\_\_ ```python -def __init__(sql_url: str = "sqlite:///", milvus_url: str = "tcp://localhost:19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = None, embedding_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: IndexType = IndexType.FLAT, index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", isolation_level: str = None, **kwargs, ,) +def __init__(sql_url: str = "sqlite:///", milvus_url: str = "tcp://localhost:19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = None, embedding_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: IndexType = IndexType.FLAT, index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", isolation_level: str = None) ``` **Arguments**: @@ -3168,7 +3171,7 @@ The current implementation is not supporting the storage of labels, so you canno #### \_\_init\_\_ ```python -def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int, List[int]] = 8080, timeout_config: tuple = (5, 15), username: str = None, password: str = None, index: str = "Document", embedding_dim: int = 768, content_field: str = "content", 
name_field: str = "name", similarity: str = "cosine", index_type: str = "hnsw", custom_schema: Optional[dict] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", **kwargs, ,) +def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int, List[int]] = 8080, timeout_config: tuple = (5, 15), username: str = None, password: str = None, index: str = "Document", embedding_dim: int = 768, content_field: str = "content", name_field: str = "name", similarity: str = "cosine", index_type: str = "hnsw", custom_schema: Optional[dict] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite") ``` **Arguments**: diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md index 65a2d6a4ac..e030e5e852 100644 --- a/docs/_src/api/api/reader.md +++ b/docs/_src/api/api/reader.md @@ -55,7 +55,7 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf #### \_\_init\_\_ ```python -def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, devices: List[torch.device] = [], no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, confidence_threshold: Optional[float] = None, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None, **kwargs, ,) +def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, devices: List[torch.device] = [], no_ans_boost: float 
= 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, confidence_threshold: Optional[float] = None, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None) ``` **Arguments**: diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index 13cbf24f83..8786207820 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -23,6 +23,7 @@ from haystack.schema import Document, Label from haystack.document_stores.base import get_batches_from_generator from haystack.document_stores.filter_utils import LogicalFilterClause +from haystack.errors import DocumentStoreError logger = logging.getLogger(__name__) @@ -54,8 +55,8 @@ def __init__( recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", - similarity="dot_product", - timeout=30, + similarity: str = "dot_product", + timeout: int = 30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", @@ -179,9 +180,9 @@ def __init__( self.scroll = scroll self.skip_missing_embeddings: bool = skip_missing_embeddings if similarity in ["cosine", "dot_product", "l2"]: - self.similarity = similarity + self.similarity: str = similarity else: - raise Exception( + raise DocumentStoreError( f"Invalid value {similarity} for similarity in ElasticSearchDocumentStore constructor. 
Choose between 'cosine', 'l2' and 'dot_product'" ) if index_type in ["flat", "hnsw"]: @@ -1592,7 +1593,42 @@ def delete_index(self, index: str): class OpenSearchDocumentStore(ElasticsearchDocumentStore): - def __init__(self, verify_certs=False, scheme="https", username="admin", password="admin", port=9200, **kwargs): + def __init__( + self, + scheme: str = "https", # Mind this different default param + username: str = "admin", # Mind this different default param + password: str = "admin", # Mind this different default param + host: Union[str, List[str]] = "localhost", + port: Union[int, List[int]] = 9200, + api_key_id: Optional[str] = None, + api_key: Optional[str] = None, + aws4auth=None, + index: str = "document", + label_index: str = "label", + search_fields: Union[str, list] = "content", + content_field: str = "content", + name_field: str = "name", + embedding_field: str = "embedding", + embedding_dim: int = 768, + custom_mapping: Optional[dict] = None, + excluded_meta_data: Optional[list] = None, + analyzer: str = "standard", + ca_certs: Optional[str] = None, + verify_certs: bool = False, # Mind this different default param + recreate_index: bool = False, + create_index: bool = True, + refresh_type: str = "wait_for", + similarity: str = "dot_product", + timeout: int = 30, + return_embedding: bool = False, + duplicate_documents: str = "overwrite", + index_type: str = "flat", + scroll: str = "1d", + skip_missing_embeddings: bool = True, + synonyms: Optional[List] = None, + synonym_type: str = "synonym", + use_system_proxy: bool = False, + ): """ Document Store using OpenSearch (https://opensearch.org/). It is compatible with the AWS Elasticsearch Service. @@ -1662,14 +1698,44 @@ def __init__(self, verify_certs=False, scheme="https", username="admin", passwor Synonym or Synonym_graph to handle synonyms, including multi-word synonyms correctly during the analysis process. 
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html """ + super().__init__( + scheme=scheme, + username=username, + password=password, + host=host, + port=port, + api_key_id=api_key_id, + api_key=api_key, + aws4auth=aws4auth, + index=index, + label_index=label_index, + search_fields=search_fields, + content_field=content_field, + name_field=name_field, + embedding_field=embedding_field, + embedding_dim=embedding_dim, + custom_mapping=custom_mapping, + excluded_meta_data=excluded_meta_data, + analyzer=analyzer, + ca_certs=ca_certs, + verify_certs=verify_certs, + recreate_index=recreate_index, + create_index=create_index, + refresh_type=refresh_type, + similarity=similarity, + timeout=timeout, + return_embedding=return_embedding, + duplicate_documents=duplicate_documents, + index_type=index_type, + scroll=scroll, + skip_missing_embeddings=skip_missing_embeddings, + synonyms=synonyms, + synonym_type=synonym_type, + use_system_proxy=use_system_proxy, + ) self.embeddings_field_supports_similarity = False self.similarity_to_space_type = {"cosine": "cosinesimil", "dot_product": "innerproduct", "l2": "l2"} self.space_type_to_similarity = {v: k for k, v in self.similarity_to_space_type.items()} - # Overwrite default kwarg values of parent class so that in default cases we can initialize - # an OpenSearchDocumentStore without provding any arguments - super(OpenSearchDocumentStore, self).__init__( - verify_certs=verify_certs, scheme=scheme, username=username, password=password, port=port, **kwargs - ) def query_by_embedding( self, @@ -1914,7 +1980,7 @@ def _create_document_index(self, index_name: str, headers: Optional[Dict[str, st if not self.client.indices.exists(index=index_name, headers=headers): raise e - def _get_embedding_field_mapping(self, similarity: Optional[str]): + def _get_embedding_field_mapping(self, similarity: str): space_type = self.similarity_to_space_type[similarity] method: dict = 
{"space_type": space_type, "name": "hnsw", "engine": "nmslib"} @@ -2049,10 +2115,79 @@ class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore): A DocumentStore which has an Open Distro for Elasticsearch service behind it. """ - def __init__(self, similarity="cosine", **kwargs): + def __init__( + self, + scheme: str = "https", + username: str = "admin", + password: str = "admin", + host: Union[str, List[str]] = "localhost", + port: Union[int, List[int]] = 9200, + api_key_id: Optional[str] = None, + api_key: Optional[str] = None, + aws4auth=None, + index: str = "document", + label_index: str = "label", + search_fields: Union[str, list] = "content", + content_field: str = "content", + name_field: str = "name", + embedding_field: str = "embedding", + embedding_dim: int = 768, + custom_mapping: Optional[dict] = None, + excluded_meta_data: Optional[list] = None, + analyzer: str = "standard", + ca_certs: Optional[str] = None, + verify_certs: bool = False, + recreate_index: bool = False, + create_index: bool = True, + refresh_type: str = "wait_for", + similarity: str = "cosine", # Mind this different default param + timeout: int = 30, + return_embedding: bool = False, + duplicate_documents: str = "overwrite", + index_type: str = "flat", + scroll: str = "1d", + skip_missing_embeddings: bool = True, + synonyms: Optional[List] = None, + synonym_type: str = "synonym", + use_system_proxy: bool = False, + ): logger.warning( "Open Distro for Elasticsearch has been replaced by OpenSearch! " "See https://opensearch.org/faq/ for details. " "We recommend using the OpenSearchDocumentStore instead." 
) - super(OpenDistroElasticsearchDocumentStore, self).__init__(similarity=similarity, **kwargs) + super().__init__( + scheme=scheme, + username=username, + password=password, + host=host, + port=port, + api_key_id=api_key_id, + api_key=api_key, + aws4auth=aws4auth, + index=index, + label_index=label_index, + search_fields=search_fields, + content_field=content_field, + name_field=name_field, + embedding_field=embedding_field, + embedding_dim=embedding_dim, + custom_mapping=custom_mapping, + excluded_meta_data=excluded_meta_data, + analyzer=analyzer, + ca_certs=ca_certs, + verify_certs=verify_certs, + recreate_index=recreate_index, + create_index=create_index, + refresh_type=refresh_type, + similarity=similarity, + timeout=timeout, + return_embedding=return_embedding, + duplicate_documents=duplicate_documents, + index_type=index_type, + scroll=scroll, + skip_missing_embeddings=skip_missing_embeddings, + synonyms=synonyms, + synonym_type=synonym_type, + use_system_proxy=use_system_proxy, + ) diff --git a/haystack/document_stores/faiss.py b/haystack/document_stores/faiss.py index 0bd53126d7..d69dbdf4e6 100644 --- a/haystack/document_stores/faiss.py +++ b/haystack/document_stores/faiss.py @@ -57,7 +57,9 @@ def __init__( faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, - **kwargs, + n_links: int = 64, + ef_search: int = 20, + ef_construction: int = 80, ): """ :param sql_url: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale @@ -102,12 +104,15 @@ def __init__( :param faiss_config_path: Stored FAISS initial configuration parameters. 
Can be created via calling `save()` :param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) + :param n_links: used only if index_factory == "HNSW" + :param ef_search: used only if index_factory == "HNSW" + :param ef_construction: used only if index_factory == "HNSW" """ # special case if we want to load an existing index from disk # load init params from disk and run init again if faiss_index_path is not None: sig = signature(self.__class__.__init__) - self._validate_params_load_from_disk(sig, locals(), kwargs) + self._validate_params_load_from_disk(sig, locals()) init_params = self._load_init_params_from_config(faiss_index_path, faiss_config_path) self.__class__.__init__(self, **init_params) # pylint: disable=non-parent-init-called return @@ -141,7 +146,9 @@ def __init__( embedding_dim=self.embedding_dim, index_factory=faiss_index_factory_str, metric_type=self.metric_type, - **kwargs, + n_links=n_links, + ef_search=ef_search, + ef_construction=ef_construction, ) self.return_embedding = return_embedding @@ -155,8 +162,8 @@ def __init__( self._validate_index_sync() - def _validate_params_load_from_disk(self, sig: Signature, locals: dict, kwargs: dict): - allowed_params = ["faiss_index_path", "faiss_config_path", "self", "kwargs"] + def _validate_params_load_from_disk(self, sig: Signature, locals: dict): + allowed_params = ["faiss_index_path", "faiss_config_path", "self"] invalid_param_set = False for param in sig.parameters.values(): @@ -164,7 +171,7 @@ def _validate_params_load_from_disk(self, sig: Signature, locals: dict, kwargs: invalid_param_set = True break - if invalid_param_set or len(kwargs) > 0: + if invalid_param_set: raise ValueError("if faiss_index_path is passed no other params besides faiss_config_path are allowed.") def _validate_index_sync(self): @@ -179,14 +186,21 @@ def _validate_index_sync(self): "was used when 
creating the original index." ) - def _create_new_index(self, embedding_dim: int, metric_type, index_factory: str = "Flat", **kwargs): + def _create_new_index( + self, + embedding_dim: int, + metric_type, + index_factory: str = "Flat", + n_links: int = 64, + ef_search: int = 20, + ef_construction: int = 80, + ): if index_factory == "HNSW": # faiss index factory doesn't give the same results for HNSW IP, therefore direct init. # defaults here are similar to DPR codebase (good accuracy, but very high RAM consumption) - n_links = kwargs.get("n_links", 64) index = faiss.IndexHNSWFlat(embedding_dim, n_links, metric_type) - index.hnsw.efSearch = kwargs.get("efSearch", 20) # 20 - index.hnsw.efConstruction = kwargs.get("efConstruction", 80) # 80 + index.hnsw.efSearch = ef_search + index.hnsw.efConstruction = ef_construction if "ivf" in index_factory.lower(): # enable reconstruction of vectors for inverted index self.faiss_indexes[index].set_direct_map_type(faiss.DirectMap.Hashtable) diff --git a/haystack/document_stores/milvus1.py b/haystack/document_stores/milvus1.py index 6f76f9fbd5..c1ca75bb25 100644 --- a/haystack/document_stores/milvus1.py +++ b/haystack/document_stores/milvus1.py @@ -60,7 +60,6 @@ def __init__( progress_bar: bool = True, duplicate_documents: str = "overwrite", isolation_level: str = None, - **kwargs, ): """ :param sql_url: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale @@ -106,7 +105,9 @@ def __init__( exists. 
:param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) """ - super().__init__() + super().__init__( + url=sql_url, index=index, duplicate_documents=duplicate_documents, isolation_level=isolation_level + ) self.milvus_server = Milvus(uri=milvus_url, pool=connection_pool) @@ -141,10 +142,6 @@ def __init__( self.embedding_field = embedding_field self.progress_bar = progress_bar - super().__init__( - url=sql_url, index=index, duplicate_documents=duplicate_documents, isolation_level=isolation_level - ) - def __del__(self): return self.milvus_server.close() diff --git a/haystack/document_stores/milvus2.py b/haystack/document_stores/milvus2.py index d1f5cf00ec..4b64f90993 100644 --- a/haystack/document_stores/milvus2.py +++ b/haystack/document_stores/milvus2.py @@ -126,7 +126,9 @@ def __init__( exists. :param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) """ - super().__init__() + super().__init__( + url=sql_url, index=index, duplicate_documents=duplicate_documents, isolation_level=isolation_level + ) connections.add_connection(default={"host": host, "port": port}) connections.connect() @@ -171,10 +173,6 @@ def __init__( self.return_embedding = return_embedding self.progress_bar = progress_bar - super().__init__( - url=sql_url, index=index, duplicate_documents=duplicate_documents, isolation_level=isolation_level - ) - def _create_collection_and_index_if_not_exist( self, index: Optional[str] = None, consistency_level: int = 0, index_param: Optional[Dict[str, Any]] = None ): diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index d27c2ba59c..b5fde05d5a 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -81,7 +81,6 @@ 
def __init__( - `"overwrite"`: Update any existing documents with the same ID when adding documents. - `"fail"`: An error is raised if the document ID of the document being added already exists. """ - # Connect to Pinecone server using python client binding pinecone.init(api_key=api_key, environment=environment) self._api_key = api_key @@ -129,8 +128,6 @@ def __init__( super().__init__(url=sql_url, index=clean_index, duplicate_documents=duplicate_documents) - # self._validate_index_sync() - def _sanitize_index_name(self, index: str) -> str: return index.replace("_", "-").lower() diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py index f3c8d78916..c740c3659b 100644 --- a/haystack/document_stores/weaviate.py +++ b/haystack/document_stores/weaviate.py @@ -70,7 +70,6 @@ def __init__( embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", - **kwargs, ): """ :param host: Weaviate server connection URL for storing and processing documents and vectors. diff --git a/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json b/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json index 7e4b003b85..f6d6cc955f 100644 --- a/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json +++ b/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json @@ -15,9 +15,6 @@ }, { "const": "1.3.0" - }, - { - "const": "1.3.1rc0" } ] }, diff --git a/haystack/json-schemas/haystack-pipeline-1.3.1rc0.schema.json b/haystack/json-schemas/haystack-pipeline-1.3.1rc0.schema.json new file mode 100644 index 0000000000..73418303bf --- /dev/null +++ b/haystack/json-schemas/haystack-pipeline-1.3.1rc0.schema.json @@ -0,0 +1,4191 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://haystack.deepset.ai/haystack/json-schemas/haystack-pipeline-1.3.1rc0.schema.json", + "title": "Haystack Pipeline", + "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. 
For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions", + "type": "object", + "properties": { + "version": { + "title": "Version", + "description": "Version of the Haystack Pipeline file.", + "type": "string", + "oneOf": [ + { + "const": "1.3.1rc0" + } + ] + }, + "components": { + "title": "Components", + "description": "Component nodes and their configurations, to later be used in the pipelines section. Define here all the building blocks for the pipelines.", + "type": "array", + "items": { + "anyOf": [ + { + "$ref": "#/definitions/DeepsetCloudDocumentStoreComponent" + }, + { + "$ref": "#/definitions/ElasticsearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/FAISSDocumentStoreComponent" + }, + { + "$ref": "#/definitions/GraphDBKnowledgeGraphComponent" + }, + { + "$ref": "#/definitions/InMemoryDocumentStoreComponent" + }, + { + "$ref": "#/definitions/Milvus2DocumentStoreComponent" + }, + { + "$ref": "#/definitions/OpenDistroElasticsearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/OpenSearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/PineconeDocumentStoreComponent" + }, + { + "$ref": "#/definitions/SQLDocumentStoreComponent" + }, + { + "$ref": "#/definitions/WeaviateDocumentStoreComponent" + }, + { + "$ref": "#/definitions/AzureConverterComponent" + }, + { + "$ref": "#/definitions/CrawlerComponent" + }, + { + "$ref": "#/definitions/DensePassageRetrieverComponent" + }, + { + "$ref": "#/definitions/Docs2AnswersComponent" + }, + { + "$ref": "#/definitions/DocxToTextConverterComponent" + }, + { + "$ref": "#/definitions/ElasticsearchFilterOnlyRetrieverComponent" + }, + { + "$ref": "#/definitions/ElasticsearchRetrieverComponent" + }, + { + "$ref": "#/definitions/EmbeddingRetrieverComponent" + }, + { + "$ref": "#/definitions/EntityExtractorComponent" + }, + { + "$ref": "#/definitions/EvalAnswersComponent" + }, + { + "$ref": "#/definitions/EvalDocumentsComponent" + }, + { + "$ref": 
"#/definitions/FARMReaderComponent" + }, + { + "$ref": "#/definitions/FileTypeClassifierComponent" + }, + { + "$ref": "#/definitions/ImageToTextConverterComponent" + }, + { + "$ref": "#/definitions/JoinAnswersComponent" + }, + { + "$ref": "#/definitions/JoinDocumentsComponent" + }, + { + "$ref": "#/definitions/MarkdownConverterComponent" + }, + { + "$ref": "#/definitions/PDFToTextConverterComponent" + }, + { + "$ref": "#/definitions/PDFToTextOCRConverterComponent" + }, + { + "$ref": "#/definitions/ParsrConverterComponent" + }, + { + "$ref": "#/definitions/PreProcessorComponent" + }, + { + "$ref": "#/definitions/QuestionGeneratorComponent" + }, + { + "$ref": "#/definitions/RAGeneratorComponent" + }, + { + "$ref": "#/definitions/RCIReaderComponent" + }, + { + "$ref": "#/definitions/RouteDocumentsComponent" + }, + { + "$ref": "#/definitions/SentenceTransformersRankerComponent" + }, + { + "$ref": "#/definitions/Seq2SeqGeneratorComponent" + }, + { + "$ref": "#/definitions/SklearnQueryClassifierComponent" + }, + { + "$ref": "#/definitions/TableReaderComponent" + }, + { + "$ref": "#/definitions/TableTextRetrieverComponent" + }, + { + "$ref": "#/definitions/Text2SparqlRetrieverComponent" + }, + { + "$ref": "#/definitions/TextConverterComponent" + }, + { + "$ref": "#/definitions/TfidfRetrieverComponent" + }, + { + "$ref": "#/definitions/TikaConverterComponent" + }, + { + "$ref": "#/definitions/TransformersDocumentClassifierComponent" + }, + { + "$ref": "#/definitions/TransformersQueryClassifierComponent" + }, + { + "$ref": "#/definitions/TransformersReaderComponent" + }, + { + "$ref": "#/definitions/TransformersSummarizerComponent" + }, + { + "$ref": "#/definitions/TransformersTranslatorComponent" + } + ] + }, + "required": [ + "type", + "name" + ], + "additionalProperties": true + }, + "pipelines": { + "title": "Pipelines", + "description": "Multiple pipelines can be defined using the components from the same YAML file.", + "type": "array", + "items": { + "type": "object", 
+ "properties": { + "name": { + "title": "Name", + "description": "Name of the pipeline.", + "type": "string" + }, + "nodes": { + "title": "Nodes", + "description": "Nodes to be used by this particular pipeline", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "The name of this particular node in the pipeline. This should be one of the names from the components defined in the same file.", + "type": "string" + }, + "inputs": { + "title": "Inputs", + "description": "Input parameters for this node.", + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "name", + "inputs" + ], + "additionalProperties": false + }, + "required": [ + "name", + "nodes" + ], + "additionalProperties": false + }, + "additionalProperties": false + }, + "additionalProperties": false + } + } + }, + "required": [ + "version", + "components", + "pipelines" + ], + "additionalProperties": false, + "definitions": { + "DeepsetCloudDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "DeepsetCloudDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "api_key": { + "title": "Api Key", + "type": "string" + }, + "workspace": { + "title": "Workspace", + "default": "default", + "type": "string" + }, + "index": { + "title": "Index", + "default": "default", + "type": "string" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "api_endpoint": { + "title": "Api Endpoint", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "label_index": { + "title": "Label Index", + "default": "default", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "ElasticsearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "ElasticsearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 9200, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "username": { + "title": "Username", + "default": "", + "type": "string" + }, + "password": { + "title": "Password", + "default": "", + "type": "string" + }, + "api_key_id": { + "title": "Api Key Id", + "type": "string" + }, + "api_key": { + "title": "Api Key", + "type": "string" + }, + "aws4auth": { + "title": "Aws4Auth" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "search_fields": { + "title": "Search Fields", + "default": "content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": {} + } + ] + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "custom_mapping": { + "title": "Custom Mapping", + "type": "object" + }, + "excluded_meta_data": { + "title": "Excluded Meta Data", + "type": "array", + "items": {} + }, + "analyzer": { + "title": "Analyzer", + "default": "standard", + "type": "string" + }, + "scheme": { + "title": "Scheme", + "default": "http", + 
"type": "string" + }, + "ca_certs": { + "title": "Ca Certs", + "type": "string" + }, + "verify_certs": { + "title": "Verify Certs", + "default": true, + "type": "boolean" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + }, + "create_index": { + "title": "Create Index", + "default": true, + "type": "boolean" + }, + "refresh_type": { + "title": "Refresh Type", + "default": "wait_for", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "timeout": { + "title": "Timeout", + "default": 30, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "flat", + "type": "string" + }, + "scroll": { + "title": "Scroll", + "default": "1d", + "type": "string" + }, + "skip_missing_embeddings": { + "title": "Skip Missing Embeddings", + "default": true, + "type": "boolean" + }, + "synonyms": { + "title": "Synonyms", + "type": "array", + "items": {} + }, + "synonym_type": { + "title": "Synonym Type", + "default": "synonym", + "type": "string" + }, + "use_system_proxy": { + "title": "Use System Proxy", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "FAISSDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "FAISSDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///faiss_document_store.db", + "type": "string" + }, + "vector_dim": { + "title": "Vector Dim", + "type": "integer" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "faiss_index_factory_str": { + "title": "Faiss Index Factory Str", + "default": "Flat", + "type": "string" + }, + "faiss_index": { + "title": "Faiss Index", + "type": "string", + "default": null + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "faiss_index_path": { + "title": "Faiss Index Path", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "faiss_config_path": { + "title": "Faiss Config Path", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "isolation_level": { + "title": "Isolation Level", + "type": "string" + }, + "n_links": { + "title": "N Links", + "default": 64, + "type": "integer" + }, + "ef_search": { + "title": "Ef Search", + "default": 20, + "type": "integer" + }, + "ef_construction": { + "title": "Ef Construction", + "default": 80, + "type": "integer" + } + }, + 
"additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "GraphDBKnowledgeGraphComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "GraphDBKnowledgeGraph" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "localhost", + "type": "string" + }, + "port": { + "title": "Port", + "default": 7200, + "type": "integer" + }, + "username": { + "title": "Username", + "default": "", + "type": "string" + }, + "password": { + "title": "Password", + "default": "", + "type": "string" + }, + "index": { + "title": "Index", + "type": "string" + }, + "prefixes": { + "title": "Prefixes", + "default": "", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "InMemoryDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "InMemoryDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "scoring_batch_size": { + "title": "Scoring Batch Size", + "default": 500000, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "Milvus2DocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "Milvus2DocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///", + "type": "string" + }, + "host": { + "title": "Host", + "default": "localhost", + "type": "string" + }, + "port": { + "title": "Port", + "default": "19530", + "type": "string" + }, + "connection_pool": { + "title": "Connection Pool", + "default": "SingletonThread", + "type": "string" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "vector_dim": { + "title": "Vector Dim", + "type": "integer" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "index_file_size": { + "title": "Index File Size", + "default": 1024, + "type": "integer" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "IVF_FLAT", + "type": "string" + }, + "index_param": { + "title": "Index Param", + "type": "object" + }, + "search_param": { + "title": "Search Param", + "type": "object" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "id_field": { + "title": "Id Field", + "default": "id", + "type": "string" + }, + "custom_fields": { + "title": "Custom Fields", + "type": "array", + "items": {} + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "isolation_level": { + "title": "Isolation Level", + "type": "string" + }, + "consistency_level": { + 
"title": "Consistency Level", + "default": 0, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "OpenDistroElasticsearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "OpenDistroElasticsearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "scheme": { + "title": "Scheme", + "default": "https", + "type": "string" + }, + "username": { + "title": "Username", + "default": "admin", + "type": "string" + }, + "password": { + "title": "Password", + "default": "admin", + "type": "string" + }, + "host": { + "title": "Host", + "default": "localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 9200, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "api_key_id": { + "title": "Api Key Id", + "type": "string" + }, + "api_key": { + "title": "Api Key", + "type": "string" + }, + "aws4auth": { + "title": "Aws4Auth" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "search_fields": { + "title": "Search Fields", + "default": "content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": {} + } + ] + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name 
Field", + "default": "name", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "custom_mapping": { + "title": "Custom Mapping", + "type": "object" + }, + "excluded_meta_data": { + "title": "Excluded Meta Data", + "type": "array", + "items": {} + }, + "analyzer": { + "title": "Analyzer", + "default": "standard", + "type": "string" + }, + "ca_certs": { + "title": "Ca Certs", + "type": "string" + }, + "verify_certs": { + "title": "Verify Certs", + "default": false, + "type": "boolean" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + }, + "create_index": { + "title": "Create Index", + "default": true, + "type": "boolean" + }, + "refresh_type": { + "title": "Refresh Type", + "default": "wait_for", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "cosine", + "type": "string" + }, + "timeout": { + "title": "Timeout", + "default": 30, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "flat", + "type": "string" + }, + "scroll": { + "title": "Scroll", + "default": "1d", + "type": "string" + }, + "skip_missing_embeddings": { + "title": "Skip Missing Embeddings", + "default": true, + "type": "boolean" + }, + "synonyms": { + "title": "Synonyms", + "type": "array", + "items": {} + }, + "synonym_type": { + "title": "Synonym Type", + "default": "synonym", + "type": "string" + }, + "use_system_proxy": { + "title": "Use System Proxy", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same 
YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "OpenSearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "OpenSearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "scheme": { + "title": "Scheme", + "default": "https", + "type": "string" + }, + "username": { + "title": "Username", + "default": "admin", + "type": "string" + }, + "password": { + "title": "Password", + "default": "admin", + "type": "string" + }, + "host": { + "title": "Host", + "default": "localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 9200, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "api_key_id": { + "title": "Api Key Id", + "type": "string" + }, + "api_key": { + "title": "Api Key", + "type": "string" + }, + "aws4auth": { + "title": "Aws4Auth" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "search_fields": { + "title": "Search Fields", + "default": "content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": {} + } + ] + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 
768, + "type": "integer" + }, + "custom_mapping": { + "title": "Custom Mapping", + "type": "object" + }, + "excluded_meta_data": { + "title": "Excluded Meta Data", + "type": "array", + "items": {} + }, + "analyzer": { + "title": "Analyzer", + "default": "standard", + "type": "string" + }, + "ca_certs": { + "title": "Ca Certs", + "type": "string" + }, + "verify_certs": { + "title": "Verify Certs", + "default": false, + "type": "boolean" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + }, + "create_index": { + "title": "Create Index", + "default": true, + "type": "boolean" + }, + "refresh_type": { + "title": "Refresh Type", + "default": "wait_for", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "timeout": { + "title": "Timeout", + "default": 30, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "flat", + "type": "string" + }, + "scroll": { + "title": "Scroll", + "default": "1d", + "type": "string" + }, + "skip_missing_embeddings": { + "title": "Skip Missing Embeddings", + "default": true, + "type": "boolean" + }, + "synonyms": { + "title": "Synonyms", + "type": "array", + "items": {} + }, + "synonym_type": { + "title": "Synonym Type", + "default": "synonym", + "type": "string" + }, + "use_system_proxy": { + "title": "Use System Proxy", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "PineconeDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "PineconeDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "api_key": { + "title": "Api Key", + "type": "string" + }, + "environment": { + "title": "Environment", + "default": "us-west1-gcp", + "type": "string" + }, + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///pinecone_document_store.db", + "type": "string" + }, + "pinecone_index": { + "title": "Pinecone Index", + "type": "string", + "default": null + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "cosine", + "type": "string" + }, + "replicas": { + "title": "Replicas", + "default": 1, + "type": "integer" + }, + "shards": { + "title": "Shards", + "default": 1, + "type": "integer" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + } + }, + "required": [ + "api_key" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "SQLDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "SQLDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "url": { + "title": "Url", + "default": "sqlite://", + "type": "string" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "check_same_thread": { + "title": "Check Same Thread", + "default": false, + "type": "boolean" + }, + "isolation_level": { + "title": "Isolation Level", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "WeaviateDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "WeaviateDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "http://localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 8080, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "timeout_config": { + "title": "Timeout Config", + "default": [ + 5, + 15 + ], + "type": "array", + "items": {} + }, + "username": { + "title": "Username", + "type": "string" + }, + "password": { + "title": "Password", + "type": "string" + }, + "index": { + "title": "Index", + "default": "Document", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "cosine", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "hnsw", + "type": "string" + }, + "custom_schema": { + "title": "Custom Schema", + "type": "object" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components 
defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "AzureConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "AzureConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "endpoint": { + "title": "Endpoint", + "type": "string" + }, + "credential_key": { + "title": "Credential Key", + "type": "string" + }, + "model_id": { + "title": "Model Id", + "default": "prebuilt-document", + "type": "string" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "save_json": { + "title": "Save Json", + "default": false, + "type": "boolean" + }, + "preceding_context_len": { + "title": "Preceding Context Len", + "default": 3, + "type": "integer" + }, + "following_context_len": { + "title": "Following Context Len", + "default": 3, + "type": "integer" + }, + "merge_multiple_column_headers": { + "title": "Merge Multiple Column Headers", + "default": true, + "type": "boolean" + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "endpoint", + "credential_key" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "CrawlerComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "Crawler" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "output_dir": { + "title": "Output Dir", + "type": "string" + }, + "urls": { + "title": "Urls", + "type": "array", + "items": { + "type": "string" + } + }, + "crawler_depth": { + "title": "Crawler Depth", + "default": 1, + "type": "integer" + }, + "filter_urls": { + "title": "Filter Urls", + "type": "array", + "items": {} + }, + "overwrite_existing_files": { + "title": "Overwrite Existing Files", + "default": true + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "output_dir" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "DensePassageRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "DensePassageRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "query_embedding_model": { + "title": "Query Embedding Model", + "default": "facebook/dpr-question_encoder-single-nq-base", + "anyOf": [ + { + "type": "string", + "format": "path" + }, + { + "type": "string" + } + ] + }, + "passage_embedding_model": { + "title": "Passage Embedding Model", + "default": "facebook/dpr-ctx_encoder-single-nq-base", + "anyOf": [ + { + "type": "string", + "format": "path" + }, + { + "type": "string" + } + ] + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "max_seq_len_query": { + "title": "Max Seq Len Query", + "default": 64, + "type": "integer" + }, + "max_seq_len_passage": { + "title": "Max Seq Len Passage", + "default": 256, + "type": "integer" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "batch_size": { + "title": "Batch Size", + "default": 16, + "type": "integer" + }, + "embed_title": { + "title": "Embed Title", + "default": true, + "type": "boolean" + }, + "use_fast_tokenizers": { + "title": "Use Fast Tokenizers", + "default": true, + "type": "boolean" + }, + "infer_tokenizer_classes": { + "title": "Infer Tokenizer Classes", + "default": false, + "type": "boolean" + }, + "similarity_function": { + "title": "Similarity Function", + "default": "dot_product", + "type": "string" + }, + "global_loss_buffer_size": { + "title": "Global Loss Buffer Size", + "default": 150000, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "devices": { + "title": "Devices", + 
"type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + "use_auth_token": { + "title": "Use Auth Token", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "string" + } + ] + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "Docs2AnswersComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "Docs2Answers" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": {}, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "DocxToTextConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "DocxToTextConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "ElasticsearchFilterOnlyRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "ElasticsearchFilterOnlyRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "all_terms_must_match": { + "title": "All Terms Must Match", + "default": false, + "type": "boolean" + }, + "custom_query": { + "title": "Custom Query", + "type": "string" + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "ElasticsearchRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "ElasticsearchRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "all_terms_must_match": { + "title": "All Terms Must Match", + "default": false, + "type": "boolean" + }, + "custom_query": { + "title": "Custom Query", + "type": "string" + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "EmbeddingRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "EmbeddingRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "embedding_model": { + "title": "Embedding Model", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "batch_size": { + "title": "Batch Size", + "default": 32, + "type": "integer" + }, + "max_seq_len": { + "title": "Max Seq Len", + "default": 512, + "type": "integer" + }, + "model_format": { + "title": "Model Format", + "default": "farm", + "type": "string" + }, + "pooling_strategy": { + "title": "Pooling Strategy", + "default": "reduce_mean", + "type": "string" + }, + "emb_extraction_layer": { + "title": "Emb Extraction Layer", + "default": -1, + "type": "integer" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "devices": { + "title": "Devices", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + "use_auth_token": { + "title": "Use Auth Token", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "string" + } + ] + } + }, + "required": [ + "document_store", + "embedding_model" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "EntityExtractorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "EntityExtractor" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "dslim/bert-base-NER", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "EvalAnswersComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "EvalAnswers" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "skip_incorrect_retrieval": { + "title": "Skip Incorrect Retrieval", + "default": true, + "type": "boolean" + }, + "open_domain": { + "title": "Open Domain", + "default": true, + "type": "boolean" + }, + "sas_model": { + "title": "Sas Model", + "type": "string" + }, + "debug": { + "title": "Debug", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "EvalDocumentsComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "EvalDocuments" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "debug": { + "title": "Debug", + "default": false, + "type": "boolean" + }, + "open_domain": { + "title": "Open Domain", + "default": true, + "type": "boolean" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "FARMReaderComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "FARMReader" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "context_window_size": { + "title": "Context Window Size", + "default": 150, + "type": "integer" + }, + "batch_size": { + "title": "Batch Size", + "default": 50, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "devices": { + "title": "Devices", + "default": [], + "type": "array", + "items": { + "type": "string" + } + }, + "no_ans_boost": { + "title": "No Ans Boost", + "default": 0.0, + "type": "number" + }, + "return_no_answer": { + "title": "Return No Answer", + "default": false, + "type": "boolean" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + 
"top_k_per_candidate": { + "title": "Top K Per Candidate", + "default": 3, + "type": "integer" + }, + "top_k_per_sample": { + "title": "Top K Per Sample", + "default": 1, + "type": "integer" + }, + "num_processes": { + "title": "Num Processes", + "type": "integer" + }, + "max_seq_len": { + "title": "Max Seq Len", + "default": 256, + "type": "integer" + }, + "doc_stride": { + "title": "Doc Stride", + "default": 128, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_filtering": { + "title": "Duplicate Filtering", + "default": 0, + "type": "integer" + }, + "use_confidence_scores": { + "title": "Use Confidence Scores", + "default": true, + "type": "boolean" + }, + "confidence_threshold": { + "title": "Confidence Threshold", + "type": "number" + }, + "proxies": { + "title": "Proxies", + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "local_files_only": { + "title": "Local Files Only", + "default": false + }, + "force_download": { + "title": "Force Download", + "default": false + }, + "use_auth_token": { + "title": "Use Auth Token", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "string" + } + ] + } + }, + "required": [ + "model_name_or_path" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "FileTypeClassifierComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "FileTypeClassifier" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "supported_types": { + "title": "Supported Types", + "default": [ + "txt", + "pdf", + "md", + "docx", + "html" + ], + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "ImageToTextConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "ImageToTextConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "default": [ + "eng" + ], + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "JoinAnswersComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "JoinAnswers" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "join_mode": { + "title": "Join Mode", + "default": "concatenate", + "type": "string" + }, + "weights": { + "title": "Weights", + "type": "array", + "items": { + "type": "number" + } + }, + "top_k_join": { + "title": "Top K Join", + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "JoinDocumentsComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "JoinDocuments" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "join_mode": { + "title": "Join Mode", + "default": "concatenate", + "type": "string" + }, + "weights": { + "title": "Weights", + "type": "array", + "items": { + "type": "number" + } + }, + "top_k_join": { + "title": "Top K Join", + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "MarkdownConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "MarkdownConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "PDFToTextConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "PDFToTextConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "PDFToTextOCRConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "PDFToTextOCRConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "default": [ + "eng" + ], + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "ParsrConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "ParsrConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "parsr_url": { + "title": "Parsr Url", + "default": "http://localhost:3001", + "type": "string" + }, + "extractor": { + "title": "Extractor", + "default": "pdfminer", + "enum": [ + "pdfminer", + "pdfjs" + ], + "type": "string" + }, + "table_detection_mode": { + "title": "Table Detection Mode", + "default": "lattice", + "enum": [ + "lattice", + "stream" + ], + "type": "string" + }, + "preceding_context_len": { + "title": "Preceding Context Len", + "default": 3, + "type": "integer" + }, + "following_context_len": { + "title": "Following Context Len", + "default": 3, + "type": "integer" + }, + "remove_page_headers": { + "title": "Remove Page Headers", + "default": false, + "type": "boolean" + }, + "remove_page_footers": { + "title": "Remove Page Footers", + "default": false, + "type": "boolean" + }, + "remove_table_of_contents": { + "title": "Remove Table Of Contents", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "PreProcessorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "PreProcessor" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "clean_whitespace": { + "title": "Clean Whitespace", + "default": true, + "type": "boolean" + }, + "clean_header_footer": { + "title": "Clean Header Footer", + "default": false, + "type": "boolean" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines", + "default": true, + "type": "boolean" + }, + "remove_substrings": { + "title": "Remove Substrings", + "default": [], + "type": "array", + "items": { + "type": "string" + } + }, + "split_by": { + "title": "Split By", + "default": "word", + "type": "string" + }, + "split_length": { + "title": "Split Length", + "default": 200, + "type": "integer" + }, + "split_overlap": { + "title": "Split Overlap", + "default": 0, + "type": "integer" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary", + "default": true, + "type": "boolean" + }, + "language": { + "title": "Language", + "default": "en", + "type": "string" + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "QuestionGeneratorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "QuestionGenerator" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "valhalla/t5-base-e2e-qg" + }, + "model_version": { + "title": "Model Version" + }, + "num_beams": { + "title": "Num Beams", + "default": 4 + }, + "max_length": { + "title": "Max Length", + "default": 256 + }, + "no_repeat_ngram_size": { + "title": "No Repeat Ngram Size", + "default": 3 + }, + "length_penalty": { + "title": "Length Penalty", + "default": 1.5 + }, + "early_stopping": { + "title": "Early Stopping", + "default": true + }, + "split_length": { + "title": "Split Length", + "default": 50 + }, + "split_overlap": { + "title": "Split Overlap", + "default": 10 + }, + "use_gpu": { + "title": "Use Gpu", + "default": true + }, + "prompt": { + "title": "Prompt", + "default": "generate questions:" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "RAGeneratorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "RAGenerator" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "facebook/rag-token-nq", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "retriever": { + "title": "Retriever", + "type": "string", + "default": null + }, + "generator_type": { + "title": "Generator Type", + "default": "token", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 2, + "type": "integer" + }, + "max_length": { + "title": "Max Length", + "default": 200, + "type": "integer" + }, + "min_length": { + "title": "Min Length", + "default": 2, + "type": "integer" + }, + "num_beams": { + "title": "Num Beams", + "default": 2, + "type": "integer" + }, + "embed_title": { + "title": "Embed Title", + "default": true, + "type": "boolean" + }, + "prefix": { + "title": "Prefix", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "RCIReaderComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "RCIReader" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "row_model_name_or_path": { + "title": "Row Model Name Or Path", + "default": "michaelrglass/albert-base-rci-wikisql-row", + "type": "string" + }, + "column_model_name_or_path": { + "title": "Column Model Name Or Path", + "default": "michaelrglass/albert-base-rci-wikisql-col", + "type": "string" + }, + "row_model_version": { + "title": "Row Model Version", + "type": "string" + }, + "column_model_version": { + "title": "Column Model Version", + "type": "string" + }, + "row_tokenizer": { + "title": "Row Tokenizer", + "type": "string" + }, + "column_tokenizer": { + "title": "Column Tokenizer", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "max_seq_len": { + "title": "Max Seq Len", + "default": 256, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "RouteDocumentsComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "RouteDocuments" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "split_by": { + "title": "Split By", + "default": "content_type", + "type": "string" + }, + "metadata_values": { + "title": "Metadata Values", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "SentenceTransformersRankerComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "SentenceTransformersRanker" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "devices": { + "title": "Devices", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + } + }, + "required": [ + "model_name_or_path" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "Seq2SeqGeneratorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "Seq2SeqGenerator" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "type": "string" + }, + "input_converter": { + "title": "Input Converter", + "type": "string", + "default": null + }, + "top_k": { + "title": "Top K", + "default": 1, + "type": "integer" + }, + "max_length": { + "title": "Max Length", + "default": 200, + "type": "integer" + }, + "min_length": { + "title": "Min Length", + "default": 2, + "type": "integer" + }, + "num_beams": { + "title": "Num Beams", + "default": 8, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + } + }, + "required": [ + "model_name_or_path" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "SklearnQueryClassifierComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "SklearnQueryClassifier" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", + "anyOf": [ + { + "type": "string" + }, + {} + ] + }, + "vectorizer_name_or_path": { + "title": "Vectorizer Name Or Path", + "default": "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle", + "anyOf": [ + { + "type": "string" + }, + {} + ] + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TableReaderComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TableReader" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "google/tapas-base-finetuned-wtq", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "tokenizer": { + "title": "Tokenizer", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "top_k_per_candidate": { + "title": "Top K Per Candidate", + "default": 3, + "type": "integer" + }, + "return_no_answer": { + "title": "Return No Answer", + "default": false, + "type": "boolean" + }, + "max_seq_len": { + "title": "Max Seq Len", + "default": 256, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TableTextRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TableTextRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "query_embedding_model": { + "title": "Query Embedding Model", + "default": "deepset/bert-small-mm_retrieval-question_encoder", + "anyOf": [ + { + "type": "string", + "format": "path" + }, + { + "type": "string" + } + ] + }, + "passage_embedding_model": { + "title": "Passage Embedding Model", + "default": "deepset/bert-small-mm_retrieval-passage_encoder", + "anyOf": [ + { + "type": "string", + "format": "path" + }, + { + "type": "string" + } + ] + }, + "table_embedding_model": { + "title": "Table Embedding Model", + "default": "deepset/bert-small-mm_retrieval-table_encoder", + "anyOf": [ + { + "type": "string", + "format": "path" + }, + { + "type": "string" + } + ] + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "max_seq_len_query": { + "title": "Max Seq Len Query", + "default": 64, + "type": "integer" + }, + "max_seq_len_passage": { + "title": "Max Seq Len Passage", + "default": 256, + "type": "integer" + }, + "max_seq_len_table": { + "title": "Max Seq Len Table", + "default": 256, + "type": "integer" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "batch_size": { + "title": "Batch Size", + "default": 16, + "type": "integer" + }, + "embed_meta_fields": { + "title": "Embed Meta Fields", + "default": [ + "name", + "section_title", + "caption" + ], + "type": "array", + "items": { + "type": "string" + } + }, + "use_fast_tokenizers": { + "title": "Use Fast Tokenizers", + "default": true, + "type": "boolean" + }, + "infer_tokenizer_classes": { + "title": "Infer 
Tokenizer Classes", + "default": false, + "type": "boolean" + }, + "similarity_function": { + "title": "Similarity Function", + "default": "dot_product", + "type": "string" + }, + "global_loss_buffer_size": { + "title": "Global Loss Buffer Size", + "default": 150000, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "devices": { + "title": "Devices", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + "use_auth_token": { + "title": "Use Auth Token", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "string" + } + ] + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "Text2SparqlRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "Text2SparqlRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "knowledge_graph": { + "title": "Knowledge Graph" + }, + "model_name_or_path": { + "title": "Model Name Or Path" + }, + "top_k": { + "title": "Top K", + "default": 1, + "type": "integer" + } + }, + "required": [ + "knowledge_graph", + "model_name_or_path" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TextConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TextConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TfidfRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TfidfRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "auto_fit": { + "title": "Auto Fit", + "default": true + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TikaConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TikaConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "tika_url": { + "title": "Tika Url", + "default": "http://localhost:9998/tika", + "type": "string" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TransformersDocumentClassifierComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TransformersDocumentClassifier" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "bhadresh-savani/distilbert-base-uncased-emotion", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "tokenizer": { + "title": "Tokenizer", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "return_all_scores": { + "title": "Return All Scores", + "default": false, + "type": "boolean" + }, + "task": { + "title": "Task", + "default": "text-classification", + "type": "string" + }, + "labels": { + "title": "Labels", + "type": "array", + "items": { + "type": "string" + } + }, + "batch_size": { + "title": "Batch Size", + "default": -1, + "type": "integer" + }, + "classification_field": { + "title": "Classification Field", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TransformersQueryClassifierComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TransformersQueryClassifier" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "shahrukhx01/bert-mini-finetune-question-detection", + "anyOf": [ + { + "type": "string", + "format": "path" + }, + { + "type": "string" + } + ] + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TransformersReaderComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TransformersReader" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "distilbert-base-uncased-distilled-squad", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "tokenizer": { + "title": "Tokenizer", + "type": "string" + }, + "context_window_size": { + "title": "Context Window Size", + "default": 70, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "top_k_per_candidate": { + "title": "Top K Per Candidate", + "default": 4, + "type": "integer" + }, + "return_no_answers": { + "title": "Return No Answers", + "default": true, + "type": "boolean" 
+ }, + "max_seq_len": { + "title": "Max Seq Len", + "default": 256, + "type": "integer" + }, + "doc_stride": { + "title": "Doc Stride", + "default": 128, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TransformersSummarizerComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TransformersSummarizer" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "google/pegasus-xsum", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "tokenizer": { + "title": "Tokenizer", + "type": "string" + }, + "max_length": { + "title": "Max Length", + "default": 200, + "type": "integer" + }, + "min_length": { + "title": "Min Length", + "default": 5, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "clean_up_tokenization_spaces": { + "title": "Clean Up Tokenization Spaces", + "default": true, + "type": "boolean" + }, + "separator_for_single_summary": { + "title": "Separator For Single Summary", + "default": " ", + "type": "string" + }, + "generate_single_summary": { + "title": "Generate Single Summary", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TransformersTranslatorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TransformersTranslator" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "type": "string" + }, + "tokenizer_name": { + "title": "Tokenizer Name", + "type": "string" + }, + "max_seq_len": { + "title": "Max Seq Len", + "type": "integer" + }, + "clean_up_tokenization_spaces": { + "title": "Clean Up Tokenization Spaces", + "default": true, + "type": "boolean" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + } + }, + "required": [ + "model_name_or_path" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + } + } +} \ No newline at end of file diff --git a/haystack/json-schemas/haystack-pipeline-unstable.schema.json b/haystack/json-schemas/haystack-pipeline-unstable.schema.json index 08541b0456..c480fbf584 100644 --- a/haystack/json-schemas/haystack-pipeline-unstable.schema.json +++ b/haystack/json-schemas/haystack-pipeline-unstable.schema.json @@ -13,12 +13,6 @@ { "const": "unstable" }, - { - "const": "1.2.1rc0" - }, - { - "const": "1.3.0" - }, { "const": "1.3.1rc0" } @@ -470,11 +464,13 @@ }, "similarity": { "title": "Similarity", - "default": "dot_product" + "default": "dot_product", + "type": "string" }, "timeout": { "title": "Timeout", - "default": 30 + "default": 30, + "type": "integer" }, "return_embedding": { "title": "Return Embedding", @@ -626,6 +622,21 @@ "isolation_level": { "title": "Isolation Level", "type": "string" + }, + "n_links": { + "title": "N Links", + "default": 64, + "type": "integer" + }, + "ef_search": { + "title": "Ef Search", + "default": 20, + "type": "integer" + }, + "ef_construction": { + "title": "Ef Construction", + "default": 80, + "type": "integer" } }, "additionalProperties": false, @@ -918,9 +929,192 @@ "title": "Parameters", "type": "object", "properties": { + "scheme": { + "title": "Scheme", + "default": "https", + "type": "string" + }, + "username": { + "title": "Username", + "default": "admin", + "type": "string" + }, + "password": { + "title": "Password", + "default": "admin", + "type": "string" + }, + "host": { + "title": "Host", + "default": "localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 9200, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "api_key_id": { + "title": "Api Key Id", + "type": "string" + }, + "api_key": { + "title": "Api Key", + "type": 
"string" + }, + "aws4auth": { + "title": "Aws4Auth" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "search_fields": { + "title": "Search Fields", + "default": "content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": {} + } + ] + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "custom_mapping": { + "title": "Custom Mapping", + "type": "object" + }, + "excluded_meta_data": { + "title": "Excluded Meta Data", + "type": "array", + "items": {} + }, + "analyzer": { + "title": "Analyzer", + "default": "standard", + "type": "string" + }, + "ca_certs": { + "title": "Ca Certs", + "type": "string" + }, + "verify_certs": { + "title": "Verify Certs", + "default": false, + "type": "boolean" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + }, + "create_index": { + "title": "Create Index", + "default": true, + "type": "boolean" + }, + "refresh_type": { + "title": "Refresh Type", + "default": "wait_for", + "type": "string" + }, "similarity": { "title": "Similarity", - "default": "cosine" + "default": "cosine", + "type": "string" + }, + "timeout": { + "title": "Timeout", + "default": 30, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "flat", + "type": "string" + }, + "scroll": { + "title": "Scroll", + 
"default": "1d", + "type": "string" + }, + "skip_missing_embeddings": { + "title": "Skip Missing Embeddings", + "default": true, + "type": "boolean" + }, + "synonyms": { + "title": "Synonyms", + "type": "array", + "items": {} + }, + "synonym_type": { + "title": "Synonym Type", + "default": "synonym", + "type": "string" + }, + "use_system_proxy": { + "title": "Use System Proxy", + "default": false, + "type": "boolean" } }, "additionalProperties": false, @@ -951,25 +1145,192 @@ "title": "Parameters", "type": "object", "properties": { - "verify_certs": { - "title": "Verify Certs", - "default": false - }, "scheme": { "title": "Scheme", - "default": "https" + "default": "https", + "type": "string" }, "username": { "title": "Username", - "default": "admin" + "default": "admin", + "type": "string" }, "password": { "title": "Password", - "default": "admin" + "default": "admin", + "type": "string" + }, + "host": { + "title": "Host", + "default": "localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] }, "port": { "title": "Port", - "default": 9200 + "default": 9200, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "api_key_id": { + "title": "Api Key Id", + "type": "string" + }, + "api_key": { + "title": "Api Key", + "type": "string" + }, + "aws4auth": { + "title": "Aws4Auth" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "search_fields": { + "title": "Search Fields", + "default": "content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": {} + } + ] + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", 
+ "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "custom_mapping": { + "title": "Custom Mapping", + "type": "object" + }, + "excluded_meta_data": { + "title": "Excluded Meta Data", + "type": "array", + "items": {} + }, + "analyzer": { + "title": "Analyzer", + "default": "standard", + "type": "string" + }, + "ca_certs": { + "title": "Ca Certs", + "type": "string" + }, + "verify_certs": { + "title": "Verify Certs", + "default": false, + "type": "boolean" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + }, + "create_index": { + "title": "Create Index", + "default": true, + "type": "boolean" + }, + "refresh_type": { + "title": "Refresh Type", + "default": "wait_for", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "timeout": { + "title": "Timeout", + "default": 30, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "flat", + "type": "string" + }, + "scroll": { + "title": "Scroll", + "default": "1d", + "type": "string" + }, + "skip_missing_embeddings": { + "title": "Skip Missing Embeddings", + "default": true, + "type": "boolean" + }, + "synonyms": { + "title": "Synonyms", + "type": "array", + "items": {} + }, + "synonym_type": { + "title": "Synonym Type", + "default": "synonym", + "type": "string" + }, + "use_system_proxy": { + "title": "Use System Proxy", + "default": false, + "type": "boolean" } }, "additionalProperties": false, diff --git a/haystack/json-schemas/haystack-pipeline.schema.json b/haystack/json-schemas/haystack-pipeline.schema.json index bfc8490ad4..5418bc1186 100644 --- 
a/haystack/json-schemas/haystack-pipeline.schema.json +++ b/haystack/json-schemas/haystack-pipeline.schema.json @@ -58,7 +58,22 @@ }, { "const": "1.3.0" - }, + } + ] + } + } + }, + { + "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/haystack-pipeline-1.2.1rc0.schema.json" + } + ] + }, + { + "allOf": [ + { + "properties": { + "version": { + "oneOf": [ { "const": "1.3.1rc0" } @@ -67,7 +82,7 @@ } }, { - "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/haystack-pipeline-1.2.1rc0.schema.json" + "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/haystack-pipeline-1.3.1rc0.schema.json" } ] } diff --git a/haystack/nodes/_json_schema.py b/haystack/nodes/_json_schema.py index f9da2ebbf4..02de56af37 100644 --- a/haystack/nodes/_json_schema.py +++ b/haystack/nodes/_json_schema.py @@ -154,6 +154,13 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[ raise PipelineSchemaError(f"Could not read the __init__ method of {node_name} to create its schema.") signature = get_typed_signature(init_method) + + # Check for variadic parameters (*args or **kwargs) and raise an exception if found + if any(param.kind in {param.VAR_POSITIONAL, param.VAR_KEYWORD} for param in signature.parameters.values()): + raise PipelineSchemaError( + "Nodes cannot use variadic parameters like *args or **kwargs in their __init__ function." 
+ ) + param_fields = [ param for param in signature.parameters.values() if param.kind not in {param.VAR_POSITIONAL, param.VAR_KEYWORD} ] diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py index 2083f31be1..ff10f93cba 100644 --- a/haystack/nodes/reader/farm.py +++ b/haystack/nodes/reader/farm.py @@ -62,7 +62,6 @@ def __init__( local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None, - **kwargs, ): """ @@ -140,7 +139,6 @@ def __init__( force_download=force_download, devices=self.devices, use_auth_token=use_auth_token, - **kwargs, ) self.inferencer.model.prediction_heads[0].context_window_size = context_window_size self.inferencer.model.prediction_heads[0].no_ans_boost = no_ans_boost diff --git a/test/conftest.py b/test/conftest.py index b598d8e655..8760821a26 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -842,9 +842,7 @@ def get_document_store( ) elif document_store_type == "weaviate": - document_store = WeaviateDocumentStore( - weaviate_url="http://localhost:8080", index=index, similarity=similarity, embedding_dim=embedding_dim - ) + document_store = WeaviateDocumentStore(index=index, similarity=similarity, embedding_dim=embedding_dim) document_store.weaviate_client.schema.delete_all() document_store._create_schema_and_index_if_not_exist() diff --git a/test/test_pipeline_yaml.py b/test/test_pipeline_yaml.py index 105fdc1e4f..6f1bfb86b9 100644 --- a/test/test_pipeline_yaml.py +++ b/test/test_pipeline_yaml.py @@ -667,6 +667,72 @@ def __init__(self, some_exotic_parameter: str): Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") +def test_load_yaml_custom_component_with_variadic_args(tmp_path): + class BaseCustomNode(MockNode): + def __init__(self, base_parameter: int): + super().__init__() + self.base_parameter = base_parameter + + class CustomNode(BaseCustomNode): + def __init__(self, some_parameter: str, *args): + super().__init__(*args) + self.some_parameter = some_parameter + + with 
open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: custom_node + type: CustomNode + params: + base_parameter: 1 + some_parameter: value + pipelines: + - name: my_pipeline + nodes: + - name: custom_node + inputs: + - Query + """ + ) + with pytest.raises(PipelineSchemaError, match="variadic"): + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + + +def test_load_yaml_custom_component_with_variadic_kwargs(tmp_path): + class BaseCustomNode(MockNode): + def __init__(self, base_parameter: int): + super().__init__() + self.base_parameter = base_parameter + + class CustomNode(BaseCustomNode): + def __init__(self, some_parameter: str, **kwargs): + super().__init__(**kwargs) + self.some_parameter = some_parameter + + with open(tmp_path / "tmp_config.yml", "w") as tmp_file: + tmp_file.write( + f""" + version: unstable + components: + - name: custom_node + type: CustomNode + params: + base_parameter: 1 + some_parameter: value + pipelines: + - name: my_pipeline + nodes: + - name: custom_node + inputs: + - Query + """ + ) + with pytest.raises(PipelineSchemaError, match="variadic"): + Pipeline.load_from_yaml(path=tmp_path / "tmp_config.yml") + + def test_load_yaml_no_pipelines(tmp_path): with open(tmp_path / "tmp_config.yml", "w") as tmp_file: tmp_file.write( diff --git a/test/test_retriever.py b/test/test_retriever.py index 633474cff0..c31faa5353 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -192,9 +192,7 @@ def test_retribert_embedding(document_store, retriever, docs): if isinstance(document_store, WeaviateDocumentStore): # Weaviate sets the embedding dimension to 768 as soon as it is initialized. # We need 128 here and therefore initialize a new WeaviateDocumentStore. 
- document_store = WeaviateDocumentStore( - weaviate_url="http://localhost:8080", index="haystack_test", embedding_dim=128 - ) + document_store = WeaviateDocumentStore(index="haystack_test", embedding_dim=128) document_store.weaviate_client.schema.delete_all() document_store._create_schema_and_index_if_not_exist() document_store.return_embedding = True