feat: Add page number to Documents coming from PDFConverters and PreProcessor #2932

Merged · 17 commits · Aug 9, 2022
194 changes: 193 additions & 1 deletion docs/_src/api/api/file_converter.md
@@ -44,7 +44,7 @@
In this case the id will be generated by using the content and the defined metadata.

```python
@abstractmethod
-def convert(file_path: Path, meta: Optional[Dict[str, str]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path, meta: Optional[Dict[str, Any]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Convert a file to a list of Documents containing the text and any associated metadata.
@@ -424,6 +424,198 @@
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
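For orientation, here is a minimal sketch of a custom subclass implementing this interface. `PlainTextConverter` is a hypothetical example, not part of the library:

```python
from pathlib import Path
from typing import Any, Dict, List, Optional

from haystack.nodes import BaseConverter
from haystack.schema import Document


class PlainTextConverter(BaseConverter):
    """Hypothetical converter that wraps the raw text of a file in one Document."""

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, Any]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        # Read the file as-is and return a single Document carrying the text.
        text = Path(file_path).read_text(encoding=encoding or "UTF-8")
        return [Document(content=text, meta=meta, id_hash_keys=id_hash_keys)]
```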

<a id="parsr"></a>

# Module parsr

<a id="parsr.ParsrConverter"></a>

## ParsrConverter

```python
class ParsrConverter(BaseConverter)
```

File converter that makes use of the open-source Parsr tool by axa-group.
(https://github.com/axa-group/Parsr).
This Converter extracts both text and tables.
Supported file formats are: PDF and DOCX.

<a id="parsr.ParsrConverter.__init__"></a>

#### ParsrConverter.\_\_init\_\_

```python
def __init__(parsr_url: str = "http://localhost:3001", extractor: Literal["pdfminer", "pdfjs"] = "pdfminer", table_detection_mode: Literal["lattice", "stream"] = "lattice", preceding_context_len: int = 3, following_context_len: int = 3, remove_page_headers: bool = False, remove_page_footers: bool = False, remove_table_of_contents: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True)
```

**Arguments**:

- `parsr_url`: URL endpoint to Parsr's REST API.
- `extractor`: Backend used to extract textual structure from PDFs. ("pdfminer" or "pdfjs")
- `table_detection_mode`: Parsing method used to detect tables and their cells.
"lattice" detects tables and their cells by demarcated lines between cells.
"stream" detects tables and their cells by looking at whitespace between cells.
- `preceding_context_len`: Number of lines before a table to extract as preceding context
(will be returned as part of meta data).
- `following_context_len`: Number of lines after a table to extract as following context
(will be returned as part of meta data).
- `remove_page_headers`: Whether to remove text that Parsr detected as a page header.
- `remove_page_footers`: Whether to remove text that Parsr detected as a page footer.
- `remove_table_of_contents`: Whether to remove text that Parsr detected as a table of contents.
- `valid_languages`: Validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to check for encoding errors. If the extracted text is
not in one of the valid languages, it is likely an encoding error resulting
in garbled text.
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
- `add_page_number`: Adds the number of the page a table occurs in to the Document's meta field
`"page"`.
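As a usage sketch (this assumes a Parsr instance is reachable at the default URL; the docker command comes from the Parsr project, not this PR):

```python
from haystack.nodes import ParsrConverter

# Assumes a local Parsr instance, e.g. started with:
#   docker run -p 3001:3001 axarev/parsr
converter = ParsrConverter(
    parsr_url="http://localhost:3001",
    table_detection_mode="lattice",
    add_page_number=True,  # record the page number in the Document's meta["page"]
)
```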

<a id="parsr.ParsrConverter.convert"></a>

#### ParsrConverter.convert

```python
def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Extract text and tables from a PDF or DOCX using the open-source Parsr tool.

**Arguments**:

- `file_path`: Path to the file you want to convert.
- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents.
Can be any custom keys and values.
- `remove_numeric_tables`: Not applicable.
- `valid_languages`: Validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to check for encoding errors. If the extracted text is
not in one of the valid languages, it is likely an encoding error resulting
in garbled text.
- `encoding`: Not applicable.
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
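Continuing the sketch above with a hypothetical `my_file.pdf`:

```python
from pathlib import Path

docs = converter.convert(
    file_path=Path("my_file.pdf"),
    meta={"source": "my_file.pdf"},  # attached to every resulting Document
)
for doc in docs:
    # With add_page_number=True, meta["page"] records where the content occurs.
    print(doc.content_type, doc.meta.get("page"))
```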

<a id="azure"></a>

# Module azure

<a id="azure.AzureConverter"></a>

## AzureConverter

```python
class AzureConverter(BaseConverter)
```

File converter that makes use of Microsoft Azure's Form Recognizer service
(https://azure.microsoft.com/en-us/services/form-recognizer/).
This Converter extracts both text and tables.
Supported file formats are: PDF, JPEG, PNG, BMP and TIFF.

In order to be able to use this Converter, you need an active Azure account
and a Form Recognizer or Cognitive Services resource.
(Here you can find information on how to set this up:
https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quickstarts/try-v3-python-sdk#prerequisites)

<a id="azure.AzureConverter.__init__"></a>

#### AzureConverter.\_\_init\_\_

```python
def __init__(endpoint: str, credential_key: str, model_id: str = "prebuilt-document", valid_languages: Optional[List[str]] = None, save_json: bool = False, preceding_context_len: int = 3, following_context_len: int = 3, merge_multiple_column_headers: bool = True, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True)
```

**Arguments**:

- `endpoint`: Your Form Recognizer or Cognitive Services resource's endpoint.
- `credential_key`: Your Form Recognizer or Cognitive Services resource's subscription key.
- `model_id`: The identifier of the model you want to use to extract information out of your file.
Default: "prebuilt-document". General purpose models are "prebuilt-document"
and "prebuilt-layout".
List of available prebuilt models:
https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.2.0b1/index.html#documentanalysisclient
- `valid_languages`: Validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to check for encoding errors. If the extracted text is
not in one of the valid languages, it is likely an encoding error resulting
in garbled text.
- `save_json`: Whether to save the output of the Form Recognizer to a JSON file.
- `preceding_context_len`: Number of lines before a table to extract as preceding context (will be returned as part of meta data).
- `following_context_len`: Number of lines after a table to extract as subsequent context (will be returned as part of meta data).
- `merge_multiple_column_headers`: Some tables contain more than one row as a column header (i.e., column description).
This parameter lets you choose whether to merge multiple column header
rows into a single row.
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
- `add_page_number`: Adds the number of the page a table occurs in to the Document's meta field
`"page"`.
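A minimal setup sketch; the endpoint and key are placeholders for your own Azure resource:

```python
from haystack.nodes import AzureConverter

converter = AzureConverter(
    endpoint="https://<your-resource>.cognitiveservices.azure.com/",  # placeholder
    credential_key="<your-form-recognizer-key>",                      # placeholder
    model_id="prebuilt-document",
    save_json=True,  # keep the raw Form Recognizer output for convert_azure_json()
    add_page_number=True,
)
```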

<a id="azure.AzureConverter.convert"></a>

#### AzureConverter.convert

```python
def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None, pages: Optional[str] = None, known_language: Optional[str] = None) -> List[Document]
```

Extract text and tables from a PDF, JPEG, PNG, BMP or TIFF file using Azure's Form Recognizer service.

**Arguments**:

- `file_path`: Path to the file you want to convert.
- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents.
Can be any custom keys and values.
- `remove_numeric_tables`: Not applicable.
- `valid_languages`: Validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to check for encoding errors. If the extracted text is
not in one of the valid languages, it is likely an encoding error resulting
in garbled text.
- `encoding`: Not applicable.
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
- `pages`: Custom page numbers for multi-page documents (PDF/TIFF). Input the page numbers and/or ranges
of pages you want to get in the result. For a range of pages, use a hyphen,
like `pages="1-3, 5-6"`. Separate each page number or range with a comma.
- `known_language`: Locale hint of the input document.
See supported locales here: https://aka.ms/azsdk/formrecognizer/supportedlocales.
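For instance (the file name is hypothetical, and the `known_language` value is assumed to be among the supported locales):

```python
from pathlib import Path

docs = converter.convert(
    file_path=Path("invoice.pdf"),  # hypothetical input
    pages="1-2",                    # analyze only the first two pages
    known_language="en-US",         # locale hint for the service
)
```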

<a id="azure.AzureConverter.convert_azure_json"></a>

#### AzureConverter.convert\_azure\_json

```python
def convert_azure_json(file_path: Path, meta: Optional[Dict[str, Any]] = None, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Extract text and tables from the JSON output of Azure's Form Recognizer service.

**Arguments**:

- `file_path`: Path to the JSON-file you want to convert.
- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents.
Can be any custom keys and values.
- `valid_languages`: Validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to check for encoding errors. If the extracted text is
not in one of the valid languages, it is likely an encoding error resulting
in garbled text.
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
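A hedged sketch; where exactly `convert()` stores the JSON when `save_json=True` depends on the implementation, so the file name below is an assumption:

```python
from pathlib import Path

docs = converter.convert_azure_json(file_path=Path("invoice.json"))
for doc in docs:
    print(doc.content_type, doc.meta.get("page"))
```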

<a id="tika"></a>

# Module tika
6 changes: 5 additions & 1 deletion docs/_src/api/api/preprocessor.md
@@ -39,7 +39,7 @@ class PreProcessor(BasePreProcessor)
#### PreProcessor.\_\_init\_\_

```python
-def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, tokenizer_model_folder: Optional[Union[str, Path]] = None, language: str = "en", id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True)
+def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, tokenizer_model_folder: Optional[Union[str, Path]] = None, language: str = "en", id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True, add_page_number: bool = False)
```

**Arguments**:
@@ -70,6 +70,10 @@
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
- `progress_bar`: Whether to show a progress bar.
- `add_page_number`: Add the number of the page a paragraph occurs in to the Document's meta
field `"page"`. Page boundaries are determined by the `"\f"` character, which is added
between pages by `PDFToTextConverter`, `TikaConverter`, `ParsrConverter` and
`AzureConverter`.
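A small sketch of the behaviour (parameter choices are illustrative; the expected output assumes one split per page):

```python
from haystack.nodes import PreProcessor
from haystack.schema import Document

preprocessor = PreProcessor(
    split_by="word",
    split_length=5,
    split_respect_sentence_boundary=True,
    add_page_number=True,  # derive meta["page"] from "\f" page breaks
)

# "\f" marks a page boundary, as emitted by the converters listed above.
doc = Document(content="Text on page one.\fText on page two.")
splits = preprocessor.process([doc])
print([d.meta["page"] for d in splits])  # e.g. [1, 2]
```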

<a id="preprocessor.PreProcessor.process"></a>

4 changes: 2 additions & 2 deletions docs/_src/api/pydoc/file-converters.yml
@@ -1,11 +1,11 @@
loaders:
- type: python
search_path: [../../../../haystack/nodes/file_converter]
-modules: ['base', 'docx', 'image', 'markdown', 'pdf', 'tika', 'txt']
+modules: ['base', 'docx', 'image', 'markdown', 'pdf', 'parsr', 'azure', 'tika', 'txt']
ignore_when_discovered: ['__init__']
processors:
- type: filter
expression:
expression:
documented_only: true
do_not_filter_modules: false
skip_empty_modules: true
15 changes: 15 additions & 0 deletions haystack/json-schemas/haystack-pipeline-master.schema.json
@@ -1917,6 +1917,11 @@
"items": {
"type": "string"
}
},
"add_page_number": {
"title": "Add Page Number",
"default": true,
"type": "boolean"
}
},
"required": [
@@ -3557,6 +3562,11 @@
"items": {
"type": "string"
}
},
"add_page_number": {
"title": "Add Page Number",
"default": true,
"type": "boolean"
}
},
"additionalProperties": false,
@@ -3658,6 +3668,11 @@
"title": "Progress Bar",
"default": true,
"type": "boolean"
},
"add_page_number": {
"title": "Add Page Number",
"default": false,
"type": "boolean"
}
},
"additionalProperties": false,
17 changes: 12 additions & 5 deletions haystack/nodes/file_converter/azure.py
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Any
from collections import defaultdict
import json
import copy
@@ -42,6 +42,7 @@ def __init__(
following_context_len: int = 3,
merge_multiple_column_headers: bool = True,
id_hash_keys: Optional[List[str]] = None,
add_page_number: bool = True,
):
"""
:param endpoint: Your Form Recognizer or Cognitive Services resource's endpoint.
@@ -66,6 +67,8 @@ def __init__(
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
:param add_page_number: Adds the number of the page a table occurs in to the Document's meta field
`"page"`.
"""
super().__init__(valid_languages=valid_languages, id_hash_keys=id_hash_keys)

Expand All @@ -78,11 +81,12 @@ def __init__(
self.preceding_context_len = preceding_context_len
self.following_context_len = following_context_len
self.merge_multiple_column_headers = merge_multiple_column_headers
self.add_page_number = add_page_number

def convert(
self,
file_path: Path,
-meta: Optional[Dict[str, str]] = None,
+meta: Optional[Dict[str, Any]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
@@ -140,7 +144,7 @@ def convert(
def convert_azure_json(
self,
file_path: Path,
-meta: Optional[Dict[str, str]] = None,
+meta: Optional[Dict[str, Any]] = None,
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
@@ -177,7 +181,7 @@ def convert_azure_json(
def _convert_tables_and_text(
self,
result: AnalyzeResult,
-meta: Optional[Dict[str, str]],
+meta: Optional[Dict[str, Any]],
valid_languages: Optional[List[str]],
file_path: Path,
id_hash_keys: Optional[List[str]] = None,
@@ -204,7 +208,7 @@ def _convert_tables_and_text(
return docs

def _convert_tables(
-self, result: AnalyzeResult, meta: Optional[Dict[str, str]], id_hash_keys: Optional[List[str]] = None
+self, result: AnalyzeResult, meta: Optional[Dict[str, Any]], id_hash_keys: Optional[List[str]] = None
) -> List[Document]:
converted_tables = []

@@ -274,6 +278,9 @@ def _convert_tables(
else:
table_meta = {"preceding_context": preceding_context, "following_context": following_context}

if self.add_page_number:
table_meta["page"] = table.bounding_regions[0].page_number

table_df = pd.DataFrame(columns=table_list[0], data=table_list[1:])
converted_tables.append(
Document(content=table_df, content_type="table", meta=table_meta, id_hash_keys=id_hash_keys)
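Downstream of this change, the page number can be read back from each table Document's meta. A hypothetical sketch, given `docs` returned by `AzureConverter.convert()` with `add_page_number=True`:

```python
# Select only the tables that the service placed on the first page.
first_page_tables = [
    d for d in docs
    if d.content_type == "table" and d.meta.get("page") == 1
]
```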
4 changes: 2 additions & 2 deletions haystack/nodes/file_converter/base.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict, Union
+from typing import List, Optional, Dict, Union, Any

from abc import abstractmethod
from pathlib import Path
@@ -83,7 +83,7 @@ def __init__(
def convert(
self,
file_path: Path,
-meta: Optional[Dict[str, str]],
+meta: Optional[Dict[str, Any]],
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",