deepset-ai · bogdankostic · Oct 26, 2022 · Oct 19, 2022 · Oct 19, 2022 · Oct 20, 2022
@@ -2421,6 +2421,11 @@
           "additionalProperties": false,
           "description": "Each parameter can reference other components defined in the same YAML file.",
           "properties": {
+            "extract_headlines": {
+              "default": false,
+              "title": "Extract Headlines",
+              "type": "boolean"
+            },
             "id_hash_keys": {
               "anyOf": [
                 {
@@ -2440,6 +2445,11 @@
               "title": "Progress Bar",
               "type": "boolean"
             },
+            "remove_code_snippets": {
+              "default": true,
+              "title": "Remove Code Snippets",
+              "type": "boolean"
+            },
             "remove_numeric_tables": {
               "default": false,
               "title": "Remove Numeric Tables",

@@ -2421,6 +2421,11 @@
           "additionalProperties": false,
           "description": "Each parameter can reference other components defined in the same YAML file.",
           "properties": {
+            "extract_headlines": {
+              "default": false,
+              "title": "Extract Headlines",
+              "type": "boolean"
+            },
             "id_hash_keys": {
               "anyOf": [
                 {
@@ -2440,6 +2445,11 @@
               "title": "Progress Bar",
               "type": "boolean"
             },
+            "remove_code_snippets": {
+              "default": true,
+              "title": "Remove Code Snippets",
+              "type": "boolean"
+            },
             "remove_numeric_tables": {
               "default": false,
               "title": "Remove Numeric Tables",

@@ -1,10 +1,10 @@
 import logging
 import re
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple, Any
 
 try:
-    from bs4 import BeautifulSoup
+    from bs4 import BeautifulSoup, NavigableString
     from markdown import markdown
 except (ImportError, ModuleNotFoundError) as ie:
     from haystack.utils.import_utils import _optional_component_not_installed
@@ -19,14 +19,46 @@
 
 
 class MarkdownConverter(BaseConverter):
+    def __init__(
+        self,
+        remove_numeric_tables: bool = False,
+        valid_languages: Optional[List[str]] = None,
+        id_hash_keys: Optional[List[str]] = None,
+        progress_bar: bool = True,
+        remove_code_snippets: bool = True,
+        extract_headlines: bool = False,
+    ):
+        """
+        :param remove_numeric_tables: Not applicable.
+        :param valid_languages: Not applicable.
+        :param id_hash_keys: Generate the document ID from a custom list of strings that refer to the document's
+            attributes. To make sure you don't have duplicate documents in your DocumentStore if texts are
+            not unique, you can modify the metadata and pass for example, `"meta"` to this field ([`"content"`, `"meta"`]).
+            In this case, the ID is generated by using the content and the defined metadata.
+        :param progress_bar: Show a progress bar for the conversion.
+        :param remove_code_snippets: Whether to remove code snippets from the markdown file.
+        :param extract_headlines: Whether to extract headings from the markdown file into `meta["headlines"]`.
+        """
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables,
+            valid_languages=valid_languages,
+            id_hash_keys=id_hash_keys,
+            progress_bar=progress_bar,
+        )
+
+        self.remove_code_snippets = remove_code_snippets
+        self.extract_headlines = extract_headlines
+
     def convert(
         self,
         file_path: Path,
-        meta: Optional[Dict[str, str]] = None,
+        meta: Optional[Dict[str, Any]] = None,
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "utf-8",
         id_hash_keys: Optional[List[str]] = None,
+        remove_code_snippets: Optional[bool] = None,
+        extract_headlines: Optional[bool] = None,
     ) -> List[Document]:
         """
         Reads text from a markdown file and executes optional preprocessing steps.
@@ -40,32 +72,53 @@ def convert(
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
             In this case the id will be generated by using the content and the defined metadata.
+        :param remove_code_snippets: Whether to remove code snippets from the markdown file.
+        :param extract_headlines: Whether to extract headings from the markdown file into `meta["headlines"]`.
         """
-        if id_hash_keys is None:
-            id_hash_keys = self.id_hash_keys
+
+        id_hash_keys = id_hash_keys if id_hash_keys is not None else self.id_hash_keys
+        remove_code_snippets = remove_code_snippets if remove_code_snippets is not None else self.remove_code_snippets
+        extract_headlines = extract_headlines if extract_headlines is not None else self.extract_headlines
+
         with open(file_path, encoding=encoding, errors="ignore") as f:
             markdown_text = f.read()
-        text = self.markdown_to_text(markdown_text)
+
+        # md -> html -> text since BeautifulSoup can extract text cleanly
+        html = markdown(markdown_text)
+
+        # remove code snippets
+        if remove_code_snippets:
+            html = re.sub(r"<pre>(.*?)</pre>", " ", html, flags=re.DOTALL)
+            html = re.sub(r"<code>(.*?)</code>", " ", html, flags=re.DOTALL)
+        soup = BeautifulSoup(html, "html.parser")
+
+        if extract_headlines:
+            text, headlines = self._extract_text_and_headlines(soup)
+            if meta is None:
+                meta = {}
+            meta["headlines"] = headlines
+        else:
+            text = soup.get_text()
+
         document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
         return [document]
 
-    # Following code snippet is copied from https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
     @staticmethod
-    def markdown_to_text(markdown_string: str) -> str:
+    def _extract_text_and_headlines(soup: BeautifulSoup) -> Tuple[str, List[Dict]]:
         """
-        Converts a markdown string to plaintext
-
-        :param markdown_string: String in markdown format
+        Extracts text and headings from a soup object.
         """
-        # md -> html -> text since BeautifulSoup can extract text cleanly
-        html = markdown(markdown_string)
+        headline_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}
+        headlines = []
+        text = ""
+        for desc in soup.descendants:
+            if desc.name in headline_tags:
+                current_headline = desc.get_text()
+                current_start_idx = len(text)
+                current_level = int(desc.name[-1]) - 1
+                headlines.append({"headline": current_headline, "start_idx": current_start_idx, "level": current_level})
 
-        # remove code snippets
-        html = re.sub(r"<pre>(.*?)</pre>", " ", html)
-        html = re.sub(r"<code>(.*?)</code >", " ", html)
-
-        # extract text
-        soup = BeautifulSoup(html, "html.parser")
-        text = "".join(soup.findAll(text=True))
+            if isinstance(desc, NavigableString):
+                text += desc.get_text()
 
-        return text
+        return text, headlines