update docs for 1.2.0 release (#2234)

deepset-ai · Feb 23, 2022 · d21b6a5 · d21b6a5
1 parent 3e39ce0
commit d21b6a5
Show file tree

Hide file tree

Showing 133 changed files with 18,811 additions and 0 deletions.
diff --git a/docs/v1.2.0/Makefile b/docs/v1.2.0/Makefile
@@ -0,0 +1,26 @@
+# Minimal makefile for Sphinx documentation
+#
+# Every target other than `help` is forwarded to sphinx-build, with the
+# target name used as the builder name and $(BUILDDIR)/<target> passed as
+# the last argument.
+
+# You can set these variables from the command line (make VAR=value).
+# They are assigned with `:=`, so values coming from the environment are
+# only honoured when make is run with -e.
+
+SPHINXBUILD := sphinx-build
+# MAKEINFO is not referenced by any rule in this file.
+MAKEINFO    := makeinfo
+
+BUILDDIR    := build
+SOURCE      := _src/
+# Stricter alternative: -a writes all files, -W turns warnings into errors,
+# -n enables nit-picky mode.
+# SPHINXFLAGS := -a -W -n -A local=1 -d $(BUILDDIR)/doctree
+# -A local=1 passes the value `local` to the HTML templates; -d sets the
+# directory used for the cached doctrees.
+SPHINXFLAGS := -A local=1 -d $(BUILDDIR)/doctree
+# SPHINXOPTS already ends with the source directory.
+SPHINXOPTS  := $(SPHINXFLAGS) $(SOURCE)
+
+# Put it first so that "make" without argument is like "make help".
+# The source directory variable in this file is SOURCE; SOURCEDIR is never
+# defined here and would expand to an empty string.
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCE)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+# NOTE(review): sphinx-build documents -M as `-M <builder> <sourcedir>
+# <builddir> [options]`; here the options precede the directories. Confirm
+# this invocation does what is intended before relying on it.
+%: Makefile
+	$(SPHINXBUILD) -M $@ $(SPHINXOPTS) $(BUILDDIR)/$@
+
diff --git a/docs/v1.2.0/_src/api/Makefile b/docs/v1.2.0/_src/api/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+# Forwards every target to `sphinx-build -M <target>`, passing SOURCEDIR
+# and BUILDDIR as the source and build directories.
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+# (SPHINXOPTS and SPHINXBUILD use `?=`, so they are only assigned here when
+# they do not already have a value.)
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+# `help` is a command name rather than a file. Marking `Makefile` phony
+# keeps the catch-all pattern rule below from being applied to the
+# Makefile itself.
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+# $@ is the requested target name and is passed to Sphinx as the builder.
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/v1.2.0/_src/api/_static/floating_sidebar.css b/docs/v1.2.0/_src/api/_static/floating_sidebar.css
@@ -0,0 +1,29 @@
+/* Sidebar layout rules for the Sphinx API docs. */
+
+/* Inner wrapper: relatively positioned so its `top` offset can be adjusted. */
+div.sphinxsidebarwrapper {
+    padding: 0;
+    position: relative;
+    top: 0;
+}
+
+/* Sidebar column: fixed width, floated left, 15px horizontal padding. */
+div.sphinxsidebar {
+    float: left;
+    width: 210px;
+    margin: 0;
+    padding: 0 15px;
+    font-size: 1em;
+    text-align: left;
+}
+
+/* Logo text: larger, light weight, centred. */
+div.sphinxsidebar .logo {
+    color: #0A507A;
+    font-size: 1.8em;
+    font-weight: 300;
+    text-align: center;
+}
+
+/* Images in the logo and in download links sit on the text midline. */
+div.sphinxsidebar .logo img,
+div.sphinxsidebar .download a img {
+    vertical-align: middle;
+}
diff --git a/docs/v1.2.0/_src/api/_templates/xxlayout.html b/docs/v1.2.0/_src/api/_templates/xxlayout.html
@@ -0,0 +1,46 @@
+{# put the sidebar before the body #}
+{# sidebar1 renders the sidebar; sidebar2 is overridden with empty content. #}
+{% block sidebar1 %}{{ sidebar() }}{% endblock %}
+{% block sidebar2 %}{% endblock %}
+
+{# Extra head content: a web font, the inherited extrahead content via
+   super(), then inline CSS and the sidebar scroll handler.
+   NOTE(review): no extends tag is visible in this template; confirm how
+   super() and sidebar() are resolved where it is used. #}
+{% block extrahead %}
+    <link href='https://fonts.googleapis.com/css?family=Open+Sans:300,400,700'
+          rel='stylesheet' type='text/css' />
+{{ super() }}
+{# The "if not embedded" and "endif" markers below are Jinja comments, so the
+   style and script elements are always emitted. #}
+{#- if not embedded #}
+    <style type="text/css">
+      table.right { float: left; margin-left: 20px; }
+      table.right td { border: 1px solid #ccc; }
+      {# Hide the "related" navigation bar on the index page only. #}
+      {% if pagename == 'index' %}
+      .related { display: none; }
+      {% endif %}
+    </style>
+    <script>
+      // intelligent scrolling of the sidebar content
+      // On every window scroll, recompute the CSS `top` of the sidebar wrapper.
+      // NOTE(review): `$` and `$u` are not defined in this template; presumably
+      // jQuery and an Underscore.js alias supplied by the theme -- confirm.
+      $(window).scroll(function() {
+        var sb = $('.sphinxsidebarwrapper');
+        var win = $(window);
+        // sbh: height of the sidebar wrapper
+        var sbh = sb.height();
+        // offset: top position of the outer .sphinxsidebar element
+        var offset = $('.sphinxsidebar').position()['top'];
+        // wintop / winbot: top and bottom edges of the visible window
+        var wintop = win.scrollTop();
+        var winbot = wintop + win.innerHeight();
+        // curtop / curbot: current top and bottom edges of the wrapper
+        var curtop = sb.position()['top'];
+        var curbot = curtop + sbh;
+        // does sidebar fit in window?
+        if (sbh < win.innerHeight()) {
+          // yes: easy case -- always keep at the top
+          // (never above 0, and capped at document height - sbh - 200)
+          sb.css('top', $u.min([$u.max([0, wintop - offset - 10]),
+                                $(document).height() - sbh - 200]));
+        } else {
+          // no: only scroll if top/bottom edge of sidebar is at
+          // top/bottom edge of window
+          if (curtop > wintop && curbot > winbot) {
+            // sidebar lies below the window top and extends past the window
+            // bottom: move its top towards the window top (never above 0)
+            sb.css('top', $u.max([wintop - offset - 10, 0]));
+          } else if (curtop < wintop && curbot < winbot) {
+            // sidebar starts above the window top and ends above the window
+            // bottom: move its bottom towards the window bottom, with the
+            // same document-height cap as above
+            sb.css('top', $u.min([winbot - sbh - offset - 20,
+                                  $(document).height() - sbh - 200]));
+          }
+        }
+      });
+    </script>
+{#- endif #}
+{% endblock %}
diff --git a/docs/v1.2.0/_src/api/api/crawler.md b/docs/v1.2.0/_src/api/api/crawler.md
@@ -0,0 +1,80 @@
+<a id="crawler"></a>
+
+# Module crawler
+
+<a id="crawler.Crawler"></a>
+
+## Crawler
+
+```python
+class Crawler(BaseComponent)
+```
+
+Crawl texts from a website so that we can use them later in Haystack as a corpus for search / question answering etc.
+
+**Example:**
+```python
+|    from haystack.nodes.connector import Crawler
+|
+|    crawler = Crawler(output_dir="crawled_files")
+|    # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/
+|    docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"],
+|                         filter_urls=[r"haystack\.deepset\.ai\/overview\/"])
+```
+
+<a id="crawler.Crawler.crawl"></a>
+
+#### crawl
+
+```python
+def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None) -> List[Path]
+```
+
+Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
+file per URL, including text and basic meta data).
+You can optionally specify via `filter_urls` to only crawl URLs that match a certain pattern.
+All parameters are optional here and only meant to overwrite instance attributes at runtime.
+If no parameters are provided to this method, the instance attributes that were passed during __init__ will be used.
+
+**Arguments**:
+
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http addresses or single http address
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+    - 0: Only the initial list of URLs
+    - 1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+
+**Returns**:
+
+List of paths where the crawled webpages got stored
+
+<a id="crawler.Crawler.run"></a>
+
+#### run
+
+```python
+def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False) -> Tuple[Dict, str]
+```
+
+Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
+
+**Arguments**:
+
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http addresses or single http address
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+    - 0: Only the initial list of URLs
+    - 1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+- `return_documents`: Whether to return the content of the JSON files
+
+**Returns**:
+
+Tuple({"paths": List of filepaths, ...}, Name of output edge)
+
diff --git a/docs/v1.2.0/_src/api/api/document_classifier.md b/docs/v1.2.0/_src/api/api/document_classifier.md
@@ -0,0 +1,101 @@
+<a id="base"></a>
+
+# Module base
+
+<a id="base.BaseDocumentClassifier"></a>
+
+## BaseDocumentClassifier
+
+```python
+class BaseDocumentClassifier(BaseComponent)
+```
+
+<a id="base.BaseDocumentClassifier.timing"></a>
+
+#### timing
+
+```python
+def timing(fn, attr_name)
+```
+
+Wrapper method used to time functions.
+
+<a id="transformers"></a>
+
+# Module transformers
+
+<a id="transformers.TransformersDocumentClassifier"></a>
+
+## TransformersDocumentClassifier
+
+```python
+class TransformersDocumentClassifier(BaseDocumentClassifier)
+```
+
+Transformer-based model for document classification using HuggingFace's transformers framework
+(https://github.com/huggingface/transformers).
+While the underlying model can vary (BERT, Roberta, DistilBERT ...), the interface remains the same.
+This node classifies documents and adds the output from the classification step to the document's meta data.
+The meta field of the document is a dictionary with the following format:
+``'meta': {'name': '450_Baelor.txt', 'classification': {'label': 'neutral', 'probability': 0.9997646, ...} }``
+
+Classification is run on the document's content field by default. If you want it to run on another field,
+set the `classification_field` to one of the document's meta fields.
+
+With this document_classifier, you can directly get predictions via `predict()`.
+
+ **Usage example at query time:**
+ ```python
+|    ...
+|    retriever = ElasticsearchRetriever(document_store=document_store)
+|    document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion")
+|    p = Pipeline()
+|    p.add_node(component=retriever, name="Retriever", inputs=["Query"])
+|    p.add_node(component=document_classifier, name="Classifier", inputs=["Retriever"])
+|    res = p.run(
+|        query="Who is the father of Arya Stark?",
+|        params={"Retriever": {"top_k": 10}}
+|    )
+|
+|    # print the classification results
+|    print_documents(res, max_text_len=100, print_meta=True)
+|    # or access the predicted class label directly
+|    res["documents"][0].to_dict()["meta"]["classification"]["label"]
+ ```
+
+**Usage example at index time:**
+ ```python
+|    ...
+|    converter = TextConverter()
+|    preprocessor = PreProcessor()
+|    document_store = ElasticsearchDocumentStore()
+|    document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion",
+|                                                         batch_size=16)
+|    p = Pipeline()
+|    p.add_node(component=converter, name="TextConverter", inputs=["File"])
+|    p.add_node(component=preprocessor, name="Preprocessor", inputs=["TextConverter"])
+|    p.add_node(component=document_classifier, name="DocumentClassifier", inputs=["Preprocessor"])
+|    p.add_node(component=document_store, name="DocumentStore", inputs=["DocumentClassifier"])
+|    p.run(file_paths=file_paths)
+ ```
+
+<a id="transformers.TransformersDocumentClassifier.predict"></a>
+
+#### predict
+
+```python
+def predict(documents: List[Document]) -> List[Document]
+```
+
+Returns documents containing classification result in meta field.
+
+Documents are updated in place.
+
+**Arguments**:
+
+- `documents`: List of Documents to classify
+
+**Returns**:
+
+List of Documents enriched with meta information
+