From 4a83b2049dfe0bb0bd8ea65fd9d1975f498a9219 Mon Sep 17 00:00:00 2001 From: Branden Chan <33759007+brandenchan@users.noreply.github.com> Date: Mon, 28 Nov 2022 09:21:07 +0100 Subject: [PATCH] docs: Reformat code blocks in docstrings (#3580) * Fix docstrings for DocumentStores * Fix docstrings for AnswerGenerator * Fix docstrings for Connector * Fix docstrings for DocumentClassifier * Fix docstrings for LabelGenerator * Fix docstrings for QueryClassifier * Fix docstrings for Ranker * Fix docstrings for Retriever and Summarizer * Fix docstrings for Translator * Fix docstrings for Pipelines * Fix docstrings for Primitives * Fix Python code block spacing * Add line break before code block * Fix code blocks * fix: discard metadata fields if not set in Weaviate (#3578) * fix weaviate bug in returning embeddings and setting empty meta fields * review comment * Update unstable version and openapi schema (#3584) Co-authored-by: github-actions[bot] * fix: Flatten `DocumentClassifier` output in `SQLDocumentStore`; remove `_sql_session_rollback` hack in tests (#3273) * first draft * fix * fix * move test to test_sql * test: add test to check id_hash_keys is not ignored (#3577) * refactor: Generate JSON schema when missing (#3533) * removed unused script * print info logs when generating openapi schema * create json schema only when needed * fix tests * Remove leftover Co-authored-by: ZanSara * move milvus tests to their own module (#3596) * feat: store metadata using JSON in SQLDocumentStore (#3547) * add warnings * make the field cachable * review comment * Pin faiss-cpu as 1.7.3 seems to have problems (#3603) * Update Haystack imports (#3599) * Update Python version (#3602) * fix: `ParsrConverter` fails on pages without text (#3605) * try to fix bug * remove print * leftover * refactor: update Squad data (#3513) * refractor the to_squad data class * fix the validation label * refractor the to_squad data class * fix the validation label * add the test for the to_label object 
function * fix the tests for to_label_objects * move all the test related to squad data to one file * remove unused imports * revert tiny_augmented.json Co-authored-by: ZanSara * Url fixes (#3592) * add 2 example scripts * fixing faq script * fixing some urls * removing example scripts * black reformatting * add labeler to the repo (#3609) * convert eval metrics to python float (#3612) * feat: add support for `BM25Retriever` in `InMemoryDocumentStore` (#3561) * very first draft * implement query and query_batch * add more bm25 parameters * add rank_bm25 dependency * fix mypy * remove tokenizer callable parameter * remove unused import * only json serializable attributes * try to fix: pylint too-many-public-methods / R0904 * bm25 attribute always present * convert errors into warnings to make the tutorial 1 work * add docstrings; tests * try to make tests run * better docstrings; revert not running tests * some suggestions from review * rename elasticsearch retriever as bm25 in tests; try to test memory_bm25 * exclude tests with filters * change elasticsearch to bm25 retriever in test_summarizer * add tests * try to improve tests * better type hint * adapt test_table_text_retriever_embedding * handle non-textual docs * query only textual documents * Incorporate Reviewer feedback * refactor: replace `torch.no_grad` with `torch.inference_mode` (where possible) (#3601) * try to replace torch.no_grad * revert erroneous change * revert other module breaking * revert training/base * Fix docstrings for DocumentStores * Fix docstrings for AnswerGenerator * Fix docstrings for Connector * Fix docstrings for DocumentClassifier * Fix docstrings for LabelGenerator * Fix docstrings for QueryClassifier * Fix docstrings for Ranker * Fix docstrings for Retriever and Summarizer * Fix docstrings for Translator * Fix docstrings for Pipelines * Fix docstrings for Primitives * Fix Python code block spacing * Add line break before code block * Fix code blocks * Incorporate Reviewer 
feedback Co-authored-by: Massimiliano Pippi Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: github-actions[bot] Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Co-authored-by: Julian Risch Co-authored-by: ZanSara Co-authored-by: Espoir Murhabazi Co-authored-by: Tuana Celik Co-authored-by: tstadel <60758086+tstadel@users.noreply.github.com> --- haystack/document_stores/base.py | 7 + haystack/document_stores/deepsetcloud.py | 6 + haystack/document_stores/elasticsearch.py | 2 + haystack/document_stores/opensearch.py | 2 + haystack/document_stores/pinecone.py | 7 + haystack/document_stores/search_engine.py | 91 +++++---- haystack/document_stores/weaviate.py | 9 + haystack/nodes/answer_generator/base.py | 24 +-- haystack/nodes/answer_generator/openai.py | 20 +- .../nodes/answer_generator/transformers.py | 120 ++++++------ haystack/nodes/connector/crawler.py | 12 +- .../nodes/document_classifier/transformers.py | 66 +++---- haystack/nodes/extractor/entity.py | 4 + .../label_generator/pseudo_label_generator.py | 11 +- haystack/nodes/query_classifier/sklearn.py | 22 +-- .../nodes/query_classifier/transformers.py | 22 +-- haystack/nodes/ranker/base.py | 2 +- .../nodes/ranker/sentence_transformers.py | 12 +- haystack/nodes/reader/farm.py | 54 +++--- haystack/nodes/reader/transformers.py | 24 +-- haystack/nodes/retriever/base.py | 2 +- haystack/nodes/retriever/dense.py | 34 +++- haystack/nodes/retriever/sparse.py | 96 +++++----- haystack/nodes/summarizer/transformers.py | 44 ++--- haystack/nodes/translator/transformers.py | 12 +- haystack/pipelines/base.py | 172 +++++++++--------- haystack/pipelines/ray.py | 86 ++++----- haystack/pipelines/standard_pipelines.py | 6 +- haystack/schema.py | 31 ++-- haystack/utils/deepsetcloud.py | 118 ++++++------ 30 files changed, 608 insertions(+), 510 deletions(-) diff --git a/haystack/document_stores/base.py b/haystack/document_stores/base.py index 
23bc1c04bd..acaa1ecffc 100644 --- a/haystack/document_stores/base.py +++ b/haystack/document_stores/base.py @@ -126,6 +126,7 @@ def get_all_documents( operation. __Example__: + ```python filters = { "$and": { @@ -175,6 +176,7 @@ def get_all_documents_generator( operation. __Example__: + ```python filters = { "$and": { @@ -255,6 +257,7 @@ def get_all_labels_aggregated( operation. __Example__: + ```python filters = { "$and": { @@ -713,6 +716,7 @@ def query( operation. __Example__: + ```python filters = { "$and": { @@ -741,6 +745,7 @@ def query( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ @@ -816,6 +821,7 @@ def query_batch( operation. __Example__: + ```python filters = { "$and": { @@ -844,6 +850,7 @@ def query_batch( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ diff --git a/haystack/document_stores/deepsetcloud.py b/haystack/document_stores/deepsetcloud.py index a17be7ab70..e9c26fdcfa 100644 --- a/haystack/document_stores/deepsetcloud.py +++ b/haystack/document_stores/deepsetcloud.py @@ -171,6 +171,7 @@ def get_all_documents( operation. __Example__: + ```python filters = { "$and": { @@ -227,6 +228,7 @@ def get_all_documents_generator( operation. __Example__: + ```python filters = { "$and": { @@ -340,6 +342,7 @@ def query_by_embedding( operation. __Example__: + ```python filters = { "$and": { @@ -368,6 +371,7 @@ def query_by_embedding( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ @@ -446,6 +450,7 @@ def query( operation. __Example__: + ```python filters = { "$and": { @@ -474,6 +479,7 @@ def query( optionally a list of dictionaries as value. 
__Example__: + ```python filters = { "$or": [ diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index 1b4f0807fd..328d811ebc 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -306,6 +306,7 @@ def query_by_embedding( operation. __Example__: + ```python filters = { "$and": { @@ -334,6 +335,7 @@ def query_by_embedding( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ diff --git a/haystack/document_stores/opensearch.py b/haystack/document_stores/opensearch.py index 39d10702b7..d8d272b47d 100644 --- a/haystack/document_stores/opensearch.py +++ b/haystack/document_stores/opensearch.py @@ -369,6 +369,7 @@ def query_by_embedding( operation. __Example__: + ```python filters = { "$and": { @@ -397,6 +398,7 @@ def query_by_embedding( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 1cca90dc68..4f6fbbfe1f 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -238,6 +238,7 @@ def get_document_count( operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. __Example__: + ```python filters = { "$and": { @@ -445,6 +446,7 @@ def update_embeddings( operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. __Example__: + ```python filters = { "$and": { @@ -538,6 +540,7 @@ def get_all_documents( operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. __Example__: + ```python filters = { "$and": { @@ -599,6 +602,7 @@ def get_all_documents_generator( operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. 
__Example__: + ```python filters = { "$and": { @@ -926,6 +930,7 @@ def delete_documents( operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. __Example__: + ```python filters = { "$and": { @@ -1029,6 +1034,7 @@ def query_by_embedding( operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. __Example__: + ```python filters = { "$and": { @@ -1055,6 +1061,7 @@ def query_by_embedding( To use the same logical operator multiple times on the same level, logical operators take optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ diff --git a/haystack/document_stores/search_engine.py b/haystack/document_stores/search_engine.py index d1166231de..0c0fb470c8 100644 --- a/haystack/document_stores/search_engine.py +++ b/haystack/document_stores/search_engine.py @@ -287,6 +287,7 @@ def get_metadata_values_by_key( operation. __Example__: + ```python filters = { "$and": { @@ -572,6 +573,7 @@ def get_all_documents( operation. __Example__: + ```python filters = { "$and": { @@ -623,6 +625,7 @@ def get_all_documents_generator( operation. __Example__: + ```python filters = { "$and": { @@ -728,6 +731,7 @@ def query( operation. __Example__: + ```python filters = { "$and": { @@ -756,6 +760,7 @@ def query( optionally a list of dictionaries as value. 
__Example__: + ```python filters = { "$or": [ @@ -787,29 +792,29 @@ def query( :: **An example custom_query:** - ```python - | { - | "size": 10, - | "query": { - | "bool": { - | "should": [{"multi_match": { - | "query": ${query}, // mandatory query placeholder - | "type": "most_fields", - | "fields": ["content", "title"]}}], - | "filter": [ // optional custom filters - | {"terms": {"year": ${years}}}, - | {"terms": {"quarter": ${quarters}}}, - | {"range": {"date": {"gte": ${date}}}} - | ], - | } - | }, - | } + ```python + { + "size": 10, + "query": { + "bool": { + "should": [{"multi_match": { + "query": ${query}, // mandatory query placeholder + "type": "most_fields", + "fields": ["content", "title"]}}], + "filter": [ // optional custom filters + {"terms": {"year": ${years}}}, + {"terms": {"quarter": ${quarters}}}, + {"range": {"date": {"gte": ${date}}}} + ], + } + }, + } ``` **For this custom_query, a sample retrieve() could be:** ```python - | self.retrieve(query="Why did the revenue increase?", - | filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) + self.retrieve(query="Why did the revenue increase?", + filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) ``` Optionally, highlighting can be defined by specifying the highlight settings. 
@@ -818,31 +823,31 @@ def query( :: **Example custom_query with highlighting:** - ```python - | { - | "size": 10, - | "query": { - | "bool": { - | "should": [{"multi_match": { - | "query": ${query}, // mandatory query placeholder - | "type": "most_fields", - | "fields": ["content", "title"]}}], - | } - | }, - | "highlight": { // enable highlighting - | "fields": { // for fields content and title - | "content": {}, - | "title": {} - | } - | }, - | } + ```python + { + "size": 10, + "query": { + "bool": { + "should": [{"multi_match": { + "query": ${query}, // mandatory query placeholder + "type": "most_fields", + "fields": ["content", "title"]}}], + } + }, + "highlight": { // enable highlighting + "fields": { // for fields content and title + "content": {}, + "title": {} + } + }, + } ``` **For this custom_query, highlighting info can be accessed by:** ```python - | docs = self.retrieve(query="Why did the revenue increase?") - | highlighted_content = docs[0].meta["highlighted"]["content"] - | highlighted_title = docs[0].meta["highlighted"]["title"] + docs = self.retrieve(query="Why did the revenue increase?") + highlighted_content = docs[0].meta["highlighted"]["content"] + highlighted_title = docs[0].meta["highlighted"]["title"] ``` :param index: The name of the index in the DocumentStore from which to retrieve documents @@ -915,6 +920,7 @@ def query_batch( operation. __Example__: + ```python filters = { "$and": { @@ -943,6 +949,7 @@ def query_batch( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ @@ -1166,6 +1173,7 @@ def update_embeddings( operation. __Example__: + ```python filters = { "$and": { @@ -1270,6 +1278,7 @@ def delete_all_documents( operation. __Example__: + ```python filters = { "$and": { @@ -1320,6 +1329,7 @@ def delete_documents( operation. __Example__: + ```python filters = { "$and": { @@ -1383,6 +1393,7 @@ def delete_labels( operation. 
__Example__: + ```python filters = { "$and": { diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py index 1c9ff18306..9912c3f8a2 100644 --- a/haystack/document_stores/weaviate.py +++ b/haystack/document_stores/weaviate.py @@ -674,6 +674,7 @@ def get_all_documents( operation. __Example__: + ```python filters = { "$and": { @@ -818,6 +819,7 @@ def get_all_documents_generator( operation. __Example__: + ```python filters = { "$and": { @@ -876,6 +878,7 @@ def query( operation. __Example__: + ```python filters = { "$and": { @@ -904,6 +907,7 @@ def query( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ @@ -1058,6 +1062,7 @@ def query_by_embedding( operation. __Example__: + ```python filters = { "$and": { @@ -1086,6 +1091,7 @@ def query_by_embedding( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ @@ -1196,6 +1202,7 @@ def update_embeddings( operation. __Example__: + ```python filters = { "$and": { @@ -1267,6 +1274,7 @@ def delete_all_documents( operation. __Example__: + ```python filters = { "$and": { @@ -1319,6 +1327,7 @@ def delete_documents( operation. __Example__: + ```python filters = { "$and": { diff --git a/haystack/nodes/answer_generator/base.py b/haystack/nodes/answer_generator/base.py index 066b55050c..3d9fdde3fd 100644 --- a/haystack/nodes/answer_generator/base.py +++ b/haystack/nodes/answer_generator/base.py @@ -113,18 +113,18 @@ def predict_batch( :param batch_size: Not applicable. :return: Generated answers plus additional infos in a dict like this: - ```python - | {'queries': 'who got the first nobel prize in physics', - | 'answers': - | [{'query': 'who got the first nobel prize in physics', - | 'answer': ' albert einstein', - | 'meta': { 'doc_ids': [...], - | 'doc_scores': [80.42758 ...], - | 'doc_probabilities': [40.71379089355469, ... - | 'content': ['Albert Einstein was a ...] - | 'titles': ['"Albert Einstein"', ...] 
- | }}]} - ``` + ```python + {'queries': 'who got the first nobel prize in physics', + 'answers': + [{'query': 'who got the first nobel prize in physics', + 'answer': ' albert einstein', + 'meta': { 'doc_ids': [...], + 'doc_scores': [80.42758 ...], + 'doc_probabilities': [40.71379089355469, ... + 'content': ['Albert Einstein was a ...] + 'titles': ['"Albert Einstein"', ...] + }}]} + ``` """ # TODO: This method currently just calls the predict method multiple times, so there is room for improvement. diff --git a/haystack/nodes/answer_generator/openai.py b/haystack/nodes/answer_generator/openai.py index 44bc3063a1..f6781aa08f 100644 --- a/haystack/nodes/answer_generator/openai.py +++ b/haystack/nodes/answer_generator/openai.py @@ -103,16 +103,16 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = Note that OpenAI doesn't return scores for those Answers. Example: - ```python - |{ - | 'query': 'Who is the father of Arya Stark?', - | 'answers':[Answer( - | 'answer': 'Eddard,', - | 'score': None, - | ),... - | ] - |} - ``` + ```python + { + 'query': 'Who is the father of Arya Stark?', + 'answers':[Answer( + 'answer': 'Eddard,', + 'score': None, + ),... + ] + } + ``` :param query: The query you want to provide. It's a string. :param documents: List of Documents in which to search for the Answer. diff --git a/haystack/nodes/answer_generator/transformers.py b/haystack/nodes/answer_generator/transformers.py index 4fe6ff3057..298113a095 100644 --- a/haystack/nodes/answer_generator/transformers.py +++ b/haystack/nodes/answer_generator/transformers.py @@ -37,31 +37,31 @@ class RAGenerator(BaseGenerator): **Example** - ```python - | query = "who got the first nobel prize in physics?" 
- | - | # Retrieve related documents from retriever - | retrieved_docs = retriever.retrieve(query=query) - | - | # Now generate answer from query and retrieved documents - | generator.predict( - | query=query, - | documents=retrieved_docs, - | top_k=1 - | ) - | - | # Answer - | - | {'query': 'who got the first nobel prize in physics', - | 'answers': - | [{'query': 'who got the first nobel prize in physics', - | 'answer': ' albert einstein', - | 'meta': { 'doc_ids': [...], - | 'doc_scores': [80.42758 ...], - | 'doc_probabilities': [40.71379089355469, ... - | 'content': ['Albert Einstein was a ...] - | 'titles': ['"Albert Einstein"', ...] - | }}]} + ```python + query = "who got the first nobel prize in physics?" + + # Retrieve related documents from retriever + retrieved_docs = retriever.retrieve(query=query) + + # Now generate answer from query and retrieved documents + generator.predict( + query=query, + documents=retrieved_docs, + top_k=1 + ) + + # Answer + + {'query': 'who got the first nobel prize in physics', + 'answers': + [{'query': 'who got the first nobel prize in physics', + 'answer': ' albert einstein', + 'meta': { 'doc_ids': [...], + 'doc_scores': [80.42758 ...], + 'doc_probabilities': [40.71379089355469, ... + 'content': ['Albert Einstein was a ...] + 'titles': ['"Albert Einstein"', ...] + }}]} ``` """ @@ -218,16 +218,16 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = :return: Generated answers plus additional infos in a dict like this: ```python - | {'query': 'who got the first nobel prize in physics', - | 'answers': - | [{'query': 'who got the first nobel prize in physics', - | 'answer': ' albert einstein', - | 'meta': { 'doc_ids': [...], - | 'doc_scores': [80.42758 ...], - | 'doc_probabilities': [40.71379089355469, ... - | 'content': ['Albert Einstein was a ...] - | 'titles': ['"Albert Einstein"', ...] 
- | }}]} + {'query': 'who got the first nobel prize in physics', + 'answers': + [{'query': 'who got the first nobel prize in physics', + 'answer': ' albert einstein', + 'meta': { 'doc_ids': [...], + 'doc_scores': [80.42758 ...], + 'doc_probabilities': [40.71379089355469, ... + 'content': ['Albert Einstein was a ...] + 'titles': ['"Albert Einstein"', ...] + }}]} ``` """ torch.set_grad_enabled(False) @@ -308,31 +308,31 @@ class Seq2SeqGenerator(BaseGenerator): **Example** - ```python - | query = "Why is Dothraki language important?" - | - | # Retrieve related documents from retriever - | retrieved_docs = retriever.retrieve(query=query) - | - | # Now generate answer from query and retrieved documents - | generator.predict( - | query=query, - | documents=retrieved_docs, - | top_k=1 - | ) - | - | # Answer - | - | {'query': 'who got the first nobel prize in physics', - | 'answers': - | [{'query': 'who got the first nobel prize in physics', - | 'answer': ' albert einstein', - | 'meta': { 'doc_ids': [...], - | 'doc_scores': [80.42758 ...], - | 'doc_probabilities': [40.71379089355469, ... - | 'content': ['Albert Einstein was a ...] - | 'titles': ['"Albert Einstein"', ...] - | }}]} + ```python + query = "Why is Dothraki language important?" + + # Retrieve related documents from retriever + retrieved_docs = retriever.retrieve(query=query) + + # Now generate answer from query and retrieved documents + generator.predict( + query=query, + documents=retrieved_docs, + top_k=1 + ) + + # Answer + + {'query': 'who got the first nobel prize in physics', + 'answers': + [{'query': 'who got the first nobel prize in physics', + 'answer': ' albert einstein', + 'meta': { 'doc_ids': [...], + 'doc_scores': [80.42758 ...], + 'doc_probabilities': [40.71379089355469, ... + 'content': ['Albert Einstein was a ...] + 'titles': ['"Albert Einstein"', ...] 
+ }}]} ``` """ diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py index bf14a180a3..4ae3fe64db 100644 --- a/haystack/nodes/connector/crawler.py +++ b/haystack/nodes/connector/crawler.py @@ -36,12 +36,12 @@ class Crawler(BaseComponent): **Example:** ```python - | from haystack.nodes.connector import Crawler - | - | crawler = Crawler(output_dir="crawled_files") - | # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/ - | docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"], - | filter_urls= ["haystack.deepset.ai/overview/"]) + from haystack.nodes.connector import Crawler + + crawler = Crawler(output_dir="crawled_files") + # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/ + docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"], + filter_urls= ["haystack.deepset.ai/overview/"]) ``` """ diff --git a/haystack/nodes/document_classifier/transformers.py b/haystack/nodes/document_classifier/transformers.py index cfc575e0fe..4d03af925f 100644 --- a/haystack/nodes/document_classifier/transformers.py +++ b/haystack/nodes/document_classifier/transformers.py @@ -28,40 +28,42 @@ class TransformersDocumentClassifier(BaseDocumentClassifier): With this document_classifier, you can directly get predictions via predict() - **Usage example at query time:** - ```python - | ... 
- | retriever = BM25Retriever(document_store=document_store) - | document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion") - | p = Pipeline() - | p.add_node(component=retriever, name="Retriever", inputs=["Query"]) - | p.add_node(component=document_classifier, name="Classifier", inputs=["Retriever"]) - | res = p.run( - | query="Who is the father of Arya Stark?", - | params={"Retriever": {"top_k": 10}} - | ) - | - | # print the classification results - | print_documents(res, max_text_len=100, print_meta=True) - | # or access the predicted class label directly - | res["documents"][0].to_dict()["meta"]["classification"]["label"] - ``` + **Usage example at query time:** + + ```python + ... + retriever = BM25Retriever(document_store=document_store) + document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion") + p = Pipeline() + p.add_node(component=retriever, name="Retriever", inputs=["Query"]) + p.add_node(component=document_classifier, name="Classifier", inputs=["Retriever"]) + res = p.run( + query="Who is the father of Arya Stark?", + params={"Retriever": {"top_k": 10}} + ) + + # print the classification results + print_documents(res, max_text_len=100, print_meta=True) + # or access the predicted class label directly + res["documents"][0].to_dict()["meta"]["classification"]["label"] + ``` **Usage example at index time:** - ```python - | ... 
- | converter = TextConverter() - | preprocessor = Preprocessor() - | document_store = ElasticsearchDocumentStore() - | document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion", - | batch_size=16) - | p = Pipeline() - | p.add_node(component=converter, name="TextConverter", inputs=["File"]) - | p.add_node(component=preprocessor, name="Preprocessor", inputs=["TextConverter"]) - | p.add_node(component=document_classifier, name="DocumentClassifier", inputs=["Preprocessor"]) - | p.add_node(component=document_store, name="DocumentStore", inputs=["DocumentClassifier"]) - | p.run(file_paths=file_paths) - ``` + + ```python + ... + converter = TextConverter() + preprocessor = Preprocessor() + document_store = ElasticsearchDocumentStore() + document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion", + batch_size=16) + p = Pipeline() + p.add_node(component=converter, name="TextConverter", inputs=["File"]) + p.add_node(component=preprocessor, name="Preprocessor", inputs=["TextConverter"]) + p.add_node(component=document_classifier, name="DocumentClassifier", inputs=["Preprocessor"]) + p.add_node(component=document_store, name="DocumentStore", inputs=["DocumentClassifier"]) + p.run(file_paths=file_paths) + ``` """ def __init__( diff --git a/haystack/nodes/extractor/entity.py b/haystack/nodes/extractor/entity.py index 743e6ce6bc..7f1ce3d8d1 100644 --- a/haystack/nodes/extractor/entity.py +++ b/haystack/nodes/extractor/entity.py @@ -484,12 +484,16 @@ def simplify_ner_for_qa(output): """ Returns a simplified version of the output dictionary with the following structure: + + ```python [ { answer: { ... } entities: [ { ... }, {} ] } ] + ``` + The entities included are only the ones that overlap with the answer itself. 
diff --git a/haystack/nodes/label_generator/pseudo_label_generator.py b/haystack/nodes/label_generator/pseudo_label_generator.py index 190414d140..783ae9de1d 100644 --- a/haystack/nodes/label_generator/pseudo_label_generator.py +++ b/haystack/nodes/label_generator/pseudo_label_generator.py @@ -29,12 +29,11 @@ class PseudoLabelGenerator(BaseComponent): For example: ```python - | document_store = DocumentStore(...) - | retriever = Retriever(...) - | qg = QuestionGenerator(model_name_or_path="doc2query/msmarco-t5-base-v1") - | plg = PseudoLabelGenerator(qg, retriever) - | output, output_id = psg.run(documents=document_store.get_all_documents()) - | + document_store = ElasticsearchDocumentStore(...) + retriever = BM25Retriever(...) + qg = QuestionGenerator(model_name_or_path="doc2query/msmarco-t5-base-v1") + plg = PseudoLabelGenerator(qg, retriever) + output, output_id = plg.run(documents=document_store.get_all_documents()) ``` Note: diff --git a/haystack/nodes/query_classifier/sklearn.py b/haystack/nodes/query_classifier/sklearn.py index 078eb33330..c2d48ef1af 100644 --- a/haystack/nodes/query_classifier/sklearn.py +++ b/haystack/nodes/query_classifier/sklearn.py @@ -18,20 +18,20 @@ class SklearnQueryClassifier(BaseQueryClassifier): and the further processing can be customized. You can define this by connecting the further pipeline to either `output_1` or `output_2` from this node. 
Example: - ```python - |{ - |pipe = Pipeline() - |pipe.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"]) - |pipe.add_node(component=elastic_retriever, name="ElasticRetriever", inputs=["QueryClassifier.output_2"]) - |pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) - |# Keyword queries will use the ElasticRetriever - |pipe.run("kubernetes aws") + ```python + pipe = Pipeline() + pipe.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"]) + pipe.add_node(component=bm25_retriever, name="BM25Retriever", inputs=["QueryClassifier.output_2"]) + pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) - |# Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever - |pipe.run("How to manage kubernetes on aws") + # Keyword queries will use the BM25Retriever + pipe.run("kubernetes aws") - ``` + # Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever + pipe.run("How to manage kubernetes on aws") + + ``` Models: diff --git a/haystack/nodes/query_classifier/transformers.py b/haystack/nodes/query_classifier/transformers.py index 50cc7d4991..16e0cc97a2 100644 --- a/haystack/nodes/query_classifier/transformers.py +++ b/haystack/nodes/query_classifier/transformers.py @@ -24,20 +24,20 @@ class TransformersQueryClassifier(BaseQueryClassifier): This node also supports zero-shot-classification. 
Example: - ```python - |{ - |pipe = Pipeline() - |pipe.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"]) - |pipe.add_node(component=elastic_retriever, name="ElasticRetriever", inputs=["QueryClassifier.output_2"]) - |pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) + ```python + { + pipe = Pipeline() + pipe.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"]) + pipe.add_node(component=bm25_retriever, name="BM25Retriever", inputs=["QueryClassifier.output_2"]) + pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) - |# Keyword queries will use the ElasticRetriever - |pipe.run("kubernetes aws") + # Keyword queries will use the BM25Retriever + pipe.run("kubernetes aws") - |# Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever - |pipe.run("How to manage kubernetes on aws") + # Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever + pipe.run("How to manage kubernetes on aws") - ``` + ``` Models: diff --git a/haystack/nodes/ranker/base.py b/haystack/nodes/ranker/base.py index d1e1b1bd1c..313bf047ba 100644 --- a/haystack/nodes/ranker/base.py +++ b/haystack/nodes/ranker/base.py @@ -104,7 +104,7 @@ def eval( Ranker is evaluated in the same way as a Retriever based on whether it finds the correct document given the query string and at which position in the ranking of documents the correct document is. - | Returns a dict containing the following metrics: + Returns a dict containing the following metrics: - "recall": Proportion of questions for which correct document is among retrieved documents - "mrr": Mean of reciprocal rank. Rewards retrievers that give relevant documents a higher rank. 
diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py index bc34026751..5b289871f3 100644 --- a/haystack/nodes/ranker/sentence_transformers.py +++ b/haystack/nodes/ranker/sentence_transformers.py @@ -25,17 +25,17 @@ class SentenceTransformersRanker(BaseRanker): - use two output logits (no_answer, has_answer) e.g. deepset/gbert-base-germandpr-reranking https://www.sbert.net/docs/pretrained-models/ce-msmarco.html#usage-with-transformers - | With a SentenceTransformersRanker, you can: + With a SentenceTransformersRanker, you can: - directly get predictions via predict() Usage example: ```python - | retriever = BM25Retriever(document_store=document_store) - | ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2") - | p = Pipeline() - | p.add_node(component=retriever, name="ESRetriever", inputs=["Query"]) - | p.add_node(component=ranker, name="Ranker", inputs=["ESRetriever"]) + retriever = BM25Retriever(document_store=document_store) + ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2") + p = Pipeline() + p.add_node(component=retriever, name="Retriever", inputs=["Query"]) + p.add_node(component=ranker, name="Ranker", inputs=["Retriever"]) ``` """ diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py index eb577cf243..b04cf76de8 100644 --- a/haystack/nodes/reader/farm.py +++ b/haystack/nodes/reader/farm.py @@ -38,7 +38,7 @@ class FARMReader(BaseReader): Transformer based model for extractive Question Answering using the FARM framework (https://github.com/deepset-ai/FARM). While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interface remains the same. 
- | With a FARMReader, you can: + With a FARMReader, you can: - directly get predictions via predict() - fine-tune the model on QA data via train() @@ -861,19 +861,20 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = Returns dictionaries containing answers sorted by (desc.) score. Example: - ```python - |{ - | 'query': 'Who is the father of Arya Stark?', - | 'answers':[Answer( - | 'answer': 'Eddard,', - | 'context': "She travels with her father, Eddard, to King's Landing when he is", - | 'score': 0.9787139466668613, - | 'offsets_in_context': [Span(start=29, end=35], - | 'offsets_in_context': [Span(start=347, end=353], - | 'document_id': '88d1ed769d003939d3a0d28034464ab2' - | ),... - | ] - |} + + ```python + { + 'query': 'Who is the father of Arya Stark?', + 'answers':[Answer( + 'answer': 'Eddard,', + 'context': "She travels with her father, Eddard, to King's Landing when he is", + 'score': 0.9787139466668613, + 'offsets_in_context': [Span(start=29, end=35)], + 'offsets_in_document': [Span(start=347, end=353)], + 'document_id': '88d1ed769d003939d3a0d28034464ab2' + ),... + ] + } ``` :param query: Query string @@ -1280,19 +1281,20 @@ def predict_on_texts(self, question: str, texts: List[str], top_k: Optional[int] Use loaded QA model to find answers for a question in the supplied list of Document. Returns dictionaries containing answers sorted by (desc.) score. Example: + ```python - |{ - | 'question': 'Who is the father of Arya Stark?', - | 'answers':[ - | {'answer': 'Eddard,', - | 'context': " She travels with her father, Eddard, to King's Landing when he is ", - | 'offset_answer_start': 147, - | 'offset_answer_end': 154, - | 'score': 0.9787139466668613, - | 'document_id': '1337' - | },...
- | ] - |} + { + 'question': 'Who is the father of Arya Stark?', + 'answers':[ + {'answer': 'Eddard,', + 'context': " She travels with her father, Eddard, to King's Landing when he is ", + 'offset_answer_start': 147, + 'offset_answer_end': 154, + 'score': 0.9787139466668613, + 'document_id': '1337' + },... + ] + } ``` :param question: Question string diff --git a/haystack/nodes/reader/transformers.py b/haystack/nodes/reader/transformers.py index cc842be85b..ab0e4509cb 100644 --- a/haystack/nodes/reader/transformers.py +++ b/haystack/nodes/reader/transformers.py @@ -116,18 +116,18 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = Example: ```python - |{ - | 'query': 'Who is the father of Arya Stark?', - | 'answers':[ - | {'answer': 'Eddard,', - | 'context': " She travels with her father, Eddard, to King's Landing when he is ", - | 'offset_answer_start': 147, - | 'offset_answer_end': 154, - | 'score': 0.9787139466668613, - | 'document_id': '1337' - | },... - | ] - |} + { + 'query': 'Who is the father of Arya Stark?', + 'answers':[ + {'answer': 'Eddard,', + 'context': " She travels with her father, Eddard, to King's Landing when he is ", + 'offset_answer_start': 147, + 'offset_answer_end': 154, + 'score': 0.9787139466668613, + 'document_id': '1337' + },... + ] + } ``` :param query: Query string diff --git a/haystack/nodes/retriever/base.py b/haystack/nodes/retriever/base.py index 22a3b4b643..95ee2ef529 100644 --- a/haystack/nodes/retriever/base.py +++ b/haystack/nodes/retriever/base.py @@ -131,7 +131,7 @@ def eval( Retriever is evaluated based on whether it finds the correct document given the query string and at which position in the ranking of documents the correct document is. - | Returns a dict containing the following metrics: + Returns a dict containing the following metrics: - "recall": Proportion of questions for which correct document is among retrieved documents - "mrr": Mean of reciprocal rank. 
Rewards retrievers that give relevant documents a higher rank. diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index c89834668e..bb0791b2c5 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -110,16 +110,16 @@ def __init__( **Example:** - ```python - | # remote model from FAIR - | DensePassageRetriever(document_store=your_doc_store, - | query_embedding_model="facebook/dpr-question_encoder-single-nq-base", - | passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base") - | # or from local path - | DensePassageRetriever(document_store=your_doc_store, - | query_embedding_model="model_directory/question-encoder", - | passage_embedding_model="model_directory/context-encoder") - ``` + ```python + # remote model from FAIR + DensePassageRetriever(document_store=your_doc_store, + query_embedding_model="facebook/dpr-question_encoder-single-nq-base", + passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base") + # or from local path + DensePassageRetriever(document_store=your_doc_store, + query_embedding_model="model_directory/question-encoder", + passage_embedding_model="model_directory/context-encoder") + ``` :param document_store: An instance of DocumentStore from which to retrieve documents. :param query_embedding_model: Local path or remote name of question encoder checkpoint. The format equals the @@ -266,6 +266,7 @@ def retrieve( operation. __Example__: + ```python filters = { "$and": { @@ -294,6 +295,7 @@ def retrieve( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ @@ -378,6 +380,7 @@ def retrieve_batch( operation. __Example__: + ```python filters = { "$and": { @@ -406,6 +409,7 @@ def retrieve_batch( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ @@ -1020,6 +1024,7 @@ def retrieve_batch( operation. 
__Example__: + ```python filters = { "$and": { @@ -1048,6 +1053,7 @@ def retrieve_batch( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ @@ -1617,6 +1623,7 @@ def retrieve( operation. __Example__: + ```python filters = { "$and": { @@ -1645,6 +1652,7 @@ def retrieve( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ @@ -1729,6 +1737,7 @@ def retrieve_batch( operation. __Example__: + ```python filters = { "$and": { @@ -1757,6 +1766,7 @@ def retrieve_batch( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ @@ -2086,6 +2096,7 @@ def retrieve( operation. __Example__: + ```python filters = { "$and": { @@ -2114,6 +2125,7 @@ def retrieve( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ @@ -2192,6 +2204,7 @@ def retrieve_batch( operation. __Example__: + ```python filters = { "$and": { @@ -2220,6 +2233,7 @@ def retrieve_batch( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ diff --git a/haystack/nodes/retriever/sparse.py b/haystack/nodes/retriever/sparse.py index 62f6f4554f..dfdb0d3831 100644 --- a/haystack/nodes/retriever/sparse.py +++ b/haystack/nodes/retriever/sparse.py @@ -37,65 +37,67 @@ def __init__( Optionally, ES `filter` clause can be added where the values of `terms` are placeholders that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..) names must match with the filters dict supplied in self.retrieve(). 
- :: - **An example custom_query:** - ```python - | { - | "size": 10, - | "query": { - | "bool": { - | "should": [{"multi_match": { - | "query": ${query}, // mandatory query placeholder - | "type": "most_fields", - | "fields": ["content", "title"]}}], - | "filter": [ // optional custom filters - | {"terms": {"year": ${years}}}, - | {"terms": {"quarter": ${quarters}}}, - | {"range": {"date": {"gte": ${date}}}} - | ], - | } - | }, - | } - ``` + **An example custom_query:** + + ```python + { + "size": 10, + "query": { + "bool": { + "should": [{"multi_match": { + "query": ${query}, // mandatory query placeholder + "type": "most_fields", + "fields": ["content", "title"]}}], + "filter": [ // optional custom filters + {"terms": {"year": ${years}}}, + {"terms": {"quarter": ${quarters}}}, + {"range": {"date": {"gte": ${date}}}} + ], + } + }, + } + ``` - **For this custom_query, a sample retrieve() could be:** - ```python - | self.retrieve(query="Why did the revenue increase?", - | filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) + **For this custom_query, a sample retrieve() could be:** + + ```python + self.retrieve(query="Why did the revenue increase?", + filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) ``` Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings. See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html. You will find the highlighted output in the returned Document's meta field by key "highlighted". 
- :: + **Example custom_query with highlighting:** + ```python - | { - | "size": 10, - | "query": { - | "bool": { - | "should": [{"multi_match": { - | "query": ${query}, // mandatory query placeholder - | "type": "most_fields", - | "fields": ["content", "title"]}}], - | } - | }, - | "highlight": { // enable highlighting - | "fields": { // for fields content and title - | "content": {}, - | "title": {} - | } - | }, - | } + { + "size": 10, + "query": { + "bool": { + "should": [{"multi_match": { + "query": ${query}, // mandatory query placeholder + "type": "most_fields", + "fields": ["content", "title"]}}], + } + }, + "highlight": { // enable highlighting + "fields": { // for fields content and title + "content": {}, + "title": {} + } + }, + } ``` **For this custom_query, highlighting info can be accessed by:** ```python - | docs = self.retrieve(query="Why did the revenue increase?") - | highlighted_content = docs[0].meta["highlighted"]["content"] - | highlighted_title = docs[0].meta["highlighted"]["title"] + docs = self.retrieve(query="Why did the revenue increase?") + highlighted_content = docs[0].meta["highlighted"]["content"] + highlighted_title = docs[0].meta["highlighted"]["title"] ``` :param top_k: How many documents to return per query. @@ -139,6 +141,7 @@ def retrieve( operation. __Example__: + ```python filters = { "$and": { @@ -167,6 +170,7 @@ def retrieve( optionally a list of dictionaries as value. __Example__: + ```python filters = { "$or": [ @@ -261,6 +265,7 @@ def retrieve_batch( operation. __Example__: + ```python filters = { "$and": { @@ -289,6 +294,7 @@ def retrieve_batch( optionally a list of dictionaries as value. 
__Example__: + ```python filters = { "$or": [ diff --git a/haystack/nodes/summarizer/transformers.py b/haystack/nodes/summarizer/transformers.py index e3043bac75..ab6bc45456 100644 --- a/haystack/nodes/summarizer/transformers.py +++ b/haystack/nodes/summarizer/transformers.py @@ -26,28 +26,28 @@ class TransformersSummarizer(BaseSummarizer): **Example** - ```python - | docs = [Document(content="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions." - | "The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by" - | "the shutoffs which were expected to last through at least midday tomorrow.")] - | - | # Summarize - | summary = summarizer.predict( - | documents=docs) - | - | # Show results (List of Documents, containing summary and original content) - | print(summary) - | - | [ - | { - | "content": "PGE stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. ...", - | ... - | "meta": { - | "summary": "California's largest electricity provider has turned off power to hundreds of thousands of customers.", - | ... - | }, - | ... - | }, + ```python + docs = [Document(content="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions." + "The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by" + "the shutoffs which were expected to last through at least midday tomorrow.")] + + # Summarize + summary = summarizer.predict( + documents=docs) + + # Show results (List of Documents, containing summary and original content) + print(summary) + + [ + { + "content": "PGE stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. ...", + ... + "meta": { + "summary": "California's largest electricity provider has turned off power to hundreds of thousands of customers.", + ... + }, + ... 
+ }, ``` """ diff --git a/haystack/nodes/translator/transformers.py b/haystack/nodes/translator/transformers.py index 69a9f3aaa8..e896114749 100644 --- a/haystack/nodes/translator/transformers.py +++ b/haystack/nodes/translator/transformers.py @@ -27,12 +27,12 @@ class TransformersTranslator(BaseTranslator): **Example:** ```python - | DOCS = [ - | Document(content="Heinz von Foerster was an Austrian American scientist combining physics and philosophy, - | and widely attributed as the originator of Second-order cybernetics.") - | ] - | translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de") - | res = translator.translate(documents=DOCS, query=None) + DOCS = [ + Document(content="Heinz von Foerster was an Austrian American scientist combining physics and philosophy, + and widely attributed as the originator of Second-order cybernetics.") + ] + translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de") + res = translator.translate(documents=DOCS, query=None) ``` """ diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index a210d95087..ed7d4692f6 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -242,15 +242,18 @@ def list_pipelines_on_deepset_cloud( "..." 
-> additional pipeline meta information } example: - [{'name': 'my_super_nice_pipeline_config', - 'pipeline_id': '2184e0c1-c6ec-40a1-9b28-5d2768e5efa2', - 'status': 'DEPLOYED', - 'created_at': '2022-02-01T09:57:03.803991+00:00', - 'deleted': False, - 'is_default': False, - 'indexing': {'status': 'IN_PROGRESS', - 'pending_file_count': 3, - 'total_file_count': 31}}] + + ```python + [{'name': 'my_super_nice_pipeline_config', + 'pipeline_id': '2184e0c1-c6ec-40a1-9b28-5d2768e5efa2', + 'status': 'DEPLOYED', + 'created_at': '2022-02-01T09:57:03.803991+00:00', + 'deleted': False, + 'is_default': False, + 'indexing': {'status': 'IN_PROGRESS', + 'pending_file_count': 3, + 'total_file_count': 31}}] + ``` """ client = DeepsetCloud.get_pipeline_client(api_key=api_key, api_endpoint=api_endpoint, workspace=workspace) pipeline_config_infos = list(client.list_pipeline_configs()) @@ -891,22 +894,22 @@ def execute_eval_run( E.g. you can call execute_eval_run() multiple times with different retrievers in your query pipeline and compare the runs in mlflow: ```python - | for retriever_type, query_pipeline in zip(["sparse", "dpr", "embedding"], [sparse_pipe, dpr_pipe, embedding_pipe]): - | eval_result = Pipeline.execute_eval_run( - | index_pipeline=index_pipeline, - | query_pipeline=query_pipeline, - | evaluation_set_labels=labels, - | corpus_file_paths=file_paths, - | corpus_file_metas=file_metas, - | experiment_tracking_tool="mlflow", - | experiment_tracking_uri="http://localhost:5000", - | experiment_name="my-retriever-experiment", - | experiment_run_name=f"run_{retriever_type}", - | pipeline_meta={"name": f"my-pipeline-{retriever_type}"}, - | evaluation_set_meta={"name": "my-evalset"}, - | corpus_meta={"name": "my-corpus"}. 
- | reuse_index=False - | ) + for retriever_type, query_pipeline in zip(["sparse", "dpr", "embedding"], [sparse_pipe, dpr_pipe, embedding_pipe]): + eval_result = Pipeline.execute_eval_run( + index_pipeline=index_pipeline, + query_pipeline=query_pipeline, + evaluation_set_labels=labels, + corpus_file_paths=file_paths, + corpus_file_metas=file_metas, + experiment_tracking_tool="mlflow", + experiment_tracking_uri="http://localhost:5000", + experiment_name="my-retriever-experiment", + experiment_run_name=f"run_{retriever_type}", + pipeline_meta={"name": f"my-pipeline-{retriever_type}"}, + evaluation_set_meta={"name": "my-evalset"}, + corpus_meta={"name": "my-corpus"}, + reuse_index=False + ) ``` :param index_pipeline: The indexing pipeline to use. @@ -1777,9 +1780,12 @@ def get_nodes_by_class(self, class_type) -> List[Any]: Gets all nodes in the pipeline that are an instance of a certain class (incl. subclasses). This is for example helpful if you loaded a pipeline and then want to interact directly with the document store.
Example: - | from haystack.document_stores.base import BaseDocumentStore - | INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME) - | res = INDEXING_PIPELINE.get_nodes_by_class(class_type=BaseDocumentStore) + + ``` python + from haystack.document_stores.base import BaseDocumentStore + INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME) + res = INDEXING_PIPELINE.get_nodes_by_class(class_type=BaseDocumentStore) + ``` :return: List of components that are an instance the requested class """ @@ -1844,31 +1850,31 @@ def load_from_yaml( Here's a sample configuration: - ```yaml - | version: '1.9.0' - | - | components: # define all the building-blocks for Pipeline - | - name: MyReader # custom-name for the component; helpful for visualization & debugging - | type: FARMReader # Haystack Class name for the component - | params: - | model_name_or_path: deepset/roberta-base-squad2 - | - name: MyESRetriever - | type: BM25Retriever - | params: - | document_store: MyDocumentStore # params can reference other components defined in the YAML - | - name: MyDocumentStore - | type: ElasticsearchDocumentStore - | params: - | index: haystack_test - | - | pipelines: # multiple Pipelines can be defined using the components from above - | - name: my_query_pipeline # a simple extractive-qa Pipeline - | nodes: - | - name: MyESRetriever - | inputs: [Query] - | - name: MyReader - | inputs: [MyESRetriever] - ``` + ```yaml + version: '1.9.0' + + components: # define all the building-blocks for Pipeline + - name: MyReader # custom-name for the component; helpful for visualization & debugging + type: FARMReader # Haystack Class name for the component + params: + model_name_or_path: deepset/roberta-base-squad2 + - name: MyRetriever + type: BM25Retriever + params: + document_store: MyDocumentStore # params can reference other components defined in the YAML + - name: MyDocumentStore + type: 
ElasticsearchDocumentStore + params: + index: haystack_test + + pipelines: # multiple Pipelines can be defined using the components from above + - name: my_query_pipeline # a simple extractive-qa Pipeline + nodes: + - name: MyRetriever + inputs: [Query] + - name: MyReader + inputs: [MyRetriever] + ``` Note that, in case of a mismatch in version between Haystack and the YAML, a warning will be printed. If the pipeline loads correctly regardless, save again the pipeline using `Pipeline.save_to_yaml()` to remove the warning. @@ -1905,36 +1911,36 @@ def load_from_config( Here's a sample configuration: - ```python - | { - | "version": "ignore", - | "components": [ - | { # define all the building-blocks for Pipeline - | "name": "MyReader", # custom-name for the component; helpful for visualization & debugging - | "type": "FARMReader", # Haystack Class name for the component - | "params": {"no_ans_boost": -10, "model_name_or_path": "deepset/roberta-base-squad2"}, - | }, - | { - | "name": "MyESRetriever", - | "type": "BM25Retriever", - | "params": { - | "document_store": "MyDocumentStore", # params can reference other components defined in the YAML - | "custom_query": None, - | }, - | }, - | {"name": "MyDocumentStore", "type": "ElasticsearchDocumentStore", "params": {"index": "haystack_test"}}, - | ], - | "pipelines": [ - | { # multiple Pipelines can be defined using the components from above - | "name": "my_query_pipeline", # a simple extractive-qa Pipeline - | "nodes": [ - | {"name": "MyESRetriever", "inputs": ["Query"]}, - | {"name": "MyReader", "inputs": ["MyESRetriever"]}, - | ], - | } - | ], - | } - ``` + ```python + { + "version": "ignore", + "components": [ + { # define all the building-blocks for Pipeline + "name": "MyReader", # custom-name for the component; helpful for visualization & debugging + "type": "FARMReader", # Haystack Class name for the component + "params": {"no_ans_boost": -10, "model_name_or_path": "deepset/roberta-base-squad2"}, + }, + { + "name": 
"MyRetriever", + "type": "BM25Retriever", + "params": { + "document_store": "MyDocumentStore", # params can reference other components defined in the YAML + "custom_query": None, + }, + }, + {"name": "MyDocumentStore", "type": "ElasticsearchDocumentStore", "params": {"index": "haystack_test"}}, + ], + "pipelines": [ + { # multiple Pipelines can be defined using the components from above + "name": "my_query_pipeline", # a simple extractive-qa Pipeline + "nodes": [ + {"name": "MyRetriever", "inputs": ["Query"]}, + {"name": "MyReader", "inputs": ["MyRetriever"]}, + ], + } + ], + } + ``` :param pipeline_config: the pipeline config as dict :param pipeline_name: if the config contains multiple pipelines, the pipeline_name to load must be set. diff --git a/haystack/pipelines/ray.py b/haystack/pipelines/ray.py index 23f6968cb9..298a4bd7bc 100644 --- a/haystack/pipelines/ray.py +++ b/haystack/pipelines/ray.py @@ -31,19 +31,19 @@ class RayPipeline(Pipeline): To set the number of replicas, add `num_replicas` in the YAML configuration for the node in a pipeline: - ```yaml - | components: - | ... - | - | pipelines: - | - name: ray_query_pipeline - | type: RayPipeline - | nodes: - | - name: ESRetriever - | inputs: [ Query ] - | serve_deployment_kwargs: - | num_replicas: 2 # number of replicas to create on the Ray cluster - ``` + ```yaml + components: + ... + + pipelines: + - name: ray_query_pipeline + type: RayPipeline + nodes: + - name: Retriever + inputs: [ Query ] + serve_deployment_kwargs: + num_replicas: 2 # number of replicas to create on the Ray cluster + ``` A Ray Pipeline can only be created with a YAML Pipeline configuration. 
@@ -139,36 +139,36 @@ def load_from_yaml( # type: ignore Here's a sample configuration: - ```yaml - | version: '1.0.0' - | - | components: # define all the building-blocks for Pipeline - | - name: MyReader # custom-name for the component; helpful for visualization & debugging - | type: FARMReader # Haystack Class name for the component - | params: - | no_ans_boost: -10 - | model_name_or_path: deepset/roberta-base-squad2 - | - name: MyESRetriever - | type: ElasticsearchRetriever - | params: - | document_store: MyDocumentStore # params can reference other components defined in the YAML - | custom_query: null - | - name: MyDocumentStore - | type: ElasticsearchDocumentStore - | params: - | index: haystack_test - | - | pipelines: # multiple Pipelines can be defined using the components from above - | - name: my_query_pipeline # a simple extractive-qa Pipeline - | type: RayPipeline - | nodes: - | - name: MyESRetriever - | inputs: [Query] - | serve_deployment_kwargs: - | num_replicas: 2 # number of replicas to create on the Ray cluster - | - name: MyReader - | inputs: [MyESRetriever] - ``` + ```yaml + version: '1.0.0' + + components: # define all the building-blocks for Pipeline + - name: MyReader # custom-name for the component; helpful for visualization & debugging + type: FARMReader # Haystack Class name for the component + params: + no_ans_boost: -10 + model_name_or_path: deepset/roberta-base-squad2 + - name: MyRetriever + type: BM25Retriever + params: + document_store: MyDocumentStore # params can reference other components defined in the YAML + custom_query: null + - name: MyDocumentStore + type: ElasticsearchDocumentStore + params: + index: haystack_test + + pipelines: # multiple Pipelines can be defined using the components from above + - name: my_query_pipeline # a simple extractive-qa Pipeline + type: RayPipeline + nodes: + - name: MyRetriever + inputs: [Query] + serve_deployment_kwargs: + num_replicas: 2 # number of replicas to create on the Ray cluster + - 
name: MyReader + inputs: [MyRetriever] + ``` Note that, in case of a mismatch in version between Haystack and the YAML, a warning will be printed. diff --git a/haystack/pipelines/standard_pipelines.py b/haystack/pipelines/standard_pipelines.py index 4267eabf27..428e636d74 100644 --- a/haystack/pipelines/standard_pipelines.py +++ b/haystack/pipelines/standard_pipelines.py @@ -85,9 +85,9 @@ def get_nodes_by_class(self, class_type) -> List[Any]: This is for example helpful if you loaded a pipeline and then want to interact directly with the document store. Example: ```python - | from haystack.document_stores.base import BaseDocumentStore - | INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME) - | res = INDEXING_PIPELINE.get_nodes_by_class(class_type=BaseDocumentStore) + from haystack.document_stores.base import BaseDocumentStore + INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME) + res = INDEXING_PIPELINE.get_nodes_by_class(class_type=BaseDocumentStore) ``` :return: List of components that are an instance of the requested class """ diff --git a/haystack/schema.py b/haystack/schema.py index 3b179a03db..0b9aa4d8da 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -140,9 +140,13 @@ def to_dict(self, field_map={}) -> Dict: resulting dict. This way you can work with standardized Document objects in Haystack, but adjust the format that they are serialized / stored in other places (e.g. 
elasticsearch) Example: - | doc = Document(content="some text", content_type="text") - | doc.to_dict(field_map={"custom_content_field": "content"}) - | >>> {"custom_content_field": "some text", content_type": "text"} + + ```python + doc = Document(content="some text", content_type="text") + doc.to_dict(field_map={"custom_content_field": "content"}) + + # Returns {"custom_content_field": "some text", "content_type": "text"} + ``` :param field_map: Dict with keys being the custom target keys and values being the standard Document attributes :return: dict with content of the Document @@ -170,8 +174,11 @@ def from_dict( input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that they are serialized / stored in other places (e.g. elasticsearch) Example: - | my_dict = {"custom_content_field": "some text", content_type": "text"} - | Document.from_dict(my_dict, field_map={"custom_content_field": "content"}) + + ```python + my_dict = {"custom_content_field": "some text", "content_type": "text"} + Document.from_dict(my_dict, field_map={"custom_content_field": "content"}) + ``` :param field_map: Dict with keys being the custom target keys and values being the standard Document attributes :return: dict with content of the Document @@ -791,13 +798,13 @@ def __init__(self, node_results: Optional[Dict[str, pd.DataFrame]] = None) -> No For example, you can calculate eval metrics, get detailed reports, or simulate different top_k settings: ```python - | eval_results = pipeline.eval(...) - | - | # derive detailed metrics - | eval_results.calculate_metrics() - | - | # show summary of incorrect queries - | eval_results.wrong_examples() + eval_results = pipeline.eval(...) + + # derive detailed metrics + eval_results.calculate_metrics() + + # show summary of incorrect queries + eval_results.wrong_examples() ``` Each row of the underlying DataFrames contains either an answer or a document that has been retrieved during evaluation.
diff --git a/haystack/utils/deepsetcloud.py b/haystack/utils/deepsetcloud.py index a0e980587e..b9a0549297 100644 --- a/haystack/utils/deepsetcloud.py +++ b/haystack/utils/deepsetcloud.py @@ -519,15 +519,19 @@ def list_pipeline_configs(self, workspace: Optional[str] = None, headers: Option "..." -> additional pipeline meta information } example: - [{'name': 'my_super_nice_pipeline_config', - 'pipeline_id': '2184e0c1-c6ec-40a1-9b28-5d2768e5efa2', - 'status': 'DEPLOYED', - 'created_at': '2022-02-01T09:57:03.803991+00:00', - 'deleted': False, - 'is_default': False, - 'indexing': {'status': 'IN_PROGRESS', - 'pending_file_count': 3, - 'total_file_count': 31}}] + + ```python + [{'name': 'my_super_nice_pipeline_config', + 'pipeline_id': '2184e0c1-c6ec-40a1-9b28-5d2768e5efa2', + 'status': 'DEPLOYED', + 'created_at': '2022-02-01T09:57:03.803991+00:00', + 'deleted': False, + 'is_default': False, + 'indexing': {'status': 'IN_PROGRESS', + 'pending_file_count': 3, + 'total_file_count': 31}}] + ``` + """ workspace_url = self._build_workspace_url(workspace) pipelines_url = f"{workspace_url}/pipelines" @@ -1418,15 +1422,19 @@ def list_pipelines( "..." 
-> additional pipeline meta information } example: - [{'name': 'my_super_nice_pipeline_config', - 'pipeline_id': '2184e0c1-c6ec-40a1-9b28-5d2768e5efa2', - 'status': 'DEPLOYED', - 'created_at': '2022-02-01T09:57:03.803991+00:00', - 'deleted': False, - 'is_default': False, - 'indexing': {'status': 'IN_PROGRESS', - 'pending_file_count': 3, - 'total_file_count': 31}}] + + ```python + [{'name': 'my_super_nice_pipeline_config', + 'pipeline_id': '2184e0c1-c6ec-40a1-9b28-5d2768e5efa2', + 'status': 'DEPLOYED', + 'created_at': '2022-02-01T09:57:03.803991+00:00', + 'deleted': False, + 'is_default': False, + 'indexing': {'status': 'IN_PROGRESS', + 'pending_file_count': 3, + 'total_file_count': 31}}] + ``` + """ client = DeepsetCloud.get_pipeline_client(api_key=api_key, api_endpoint=api_endpoint, workspace=workspace) pipeline_config_infos = list(client.list_pipeline_configs()) @@ -1453,11 +1461,14 @@ def list_evaluation_sets( "..." -> additional pipeline meta information } example: - [{'evaluation_set_id': 'fb084729-57ad-4b57-9f78-ec0eb4d29c9f', - 'name': 'my-question-answering-evaluation-set', - 'created_at': '2022-05-06T09:54:14.830529+00:00', - 'matched_labels': 234, - 'total_labels': 234}] + + ```python + [{'evaluation_set_id': 'fb084729-57ad-4b57-9f78-ec0eb4d29c9f', + 'name': 'my-question-answering-evaluation-set', + 'created_at': '2022-05-06T09:54:14.830529+00:00', + 'matched_labels': 234, + 'total_labels': 234}] + ``` """ client = DeepsetCloud.get_evaluation_set_client(api_key=api_key, api_endpoint=api_endpoint, workspace=workspace) return client.get_evaluation_sets() @@ -1479,35 +1490,38 @@ def get_runs( Returns: list of dictionaries: List[dict] example: - [{'eval_run_name': 'my-eval-run-1', - 'parameters': { - 'pipeline_name': 'my-pipeline-1_696bc5d0-ee65-46c1-a308-059507bc353b', - 'evaluation_set_name': 'my-eval-set-name', - 'debug': False, - 'eval_mode': 0 - }, - 'metrics': { - 'isolated_exact_match': 0.45, - 'isolated_f1': 0.89, - 'isolated_sas': 0.91, - 
'integrated_exact_match': 0.39, - 'integrated_f1': 0.76, - 'integrated_sas': 0.78, - 'mean_reciprocal_rank': 0.77, - 'mean_average_precision': 0.78, - 'recall_single_hit': 0.91, - 'recall_multi_hit': 0.91, - 'normal_discounted_cummulative_gain': 0.83, - 'precision': 0.52 - }, - 'logs': {}, - 'status': 1, - 'eval_mode': 0, - 'eval_run_labels': [], - 'created_at': '2022-05-24T12:13:16.445857+00:00', - 'comment': 'This is a comment about thiseval run', - 'tags': ['experiment-1', 'experiment-2', 'experiment-3'] - }] + + ```python + [{'eval_run_name': 'my-eval-run-1', + 'parameters': { + 'pipeline_name': 'my-pipeline-1_696bc5d0-ee65-46c1-a308-059507bc353b', + 'evaluation_set_name': 'my-eval-set-name', + 'debug': False, + 'eval_mode': 0 + }, + 'metrics': { + 'isolated_exact_match': 0.45, + 'isolated_f1': 0.89, + 'isolated_sas': 0.91, + 'integrated_exact_match': 0.39, + 'integrated_f1': 0.76, + 'integrated_sas': 0.78, + 'mean_reciprocal_rank': 0.77, + 'mean_average_precision': 0.78, + 'recall_single_hit': 0.91, + 'recall_multi_hit': 0.91, + 'normal_discounted_cummulative_gain': 0.83, + 'precision': 0.52 + }, + 'logs': {}, + 'status': 1, + 'eval_mode': 0, + 'eval_run_labels': [], + 'created_at': '2022-05-24T12:13:16.445857+00:00', + 'comment': 'This is a comment about this eval run', + 'tags': ['experiment-1', 'experiment-2', 'experiment-3'] + }] + ``` """ client = DeepsetCloud.get_eval_run_client(api_key=api_key, api_endpoint=api_endpoint, workspace=workspace) return client.get_eval_runs()