-
Notifications
You must be signed in to change notification settings - Fork 1.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Extraction of headlines in markdown files #3445
Changes from all commits
d661a0b
bcb4115
55b06fe
411f80c
e5662f1
3763116
62ab837
5c59081
bf76e3f
eea5a72
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,10 @@ | ||
import logging | ||
import re | ||
from pathlib import Path | ||
from typing import Dict, List, Optional | ||
from typing import Dict, List, Optional, Tuple, Any | ||
|
||
try: | ||
from bs4 import BeautifulSoup | ||
from bs4 import BeautifulSoup, NavigableString | ||
from markdown import markdown | ||
except (ImportError, ModuleNotFoundError) as ie: | ||
from haystack.utils.import_utils import _optional_component_not_installed | ||
|
@@ -19,14 +19,46 @@ | |
|
||
|
||
class MarkdownConverter(BaseConverter): | ||
def __init__( | ||
self, | ||
remove_numeric_tables: bool = False, | ||
valid_languages: Optional[List[str]] = None, | ||
id_hash_keys: Optional[List[str]] = None, | ||
progress_bar: bool = True, | ||
remove_code_snippets: bool = True, | ||
extract_headlines: bool = False, | ||
): | ||
""" | ||
:param remove_numeric_tables: Not applicable. | ||
:param valid_languages: Not applicable. | ||
:param id_hash_keys: Generate the document ID from a custom list of strings that refer to the document's | ||
attributes. To make sure you don't have duplicate documents in your DocumentStore if texts are | ||
not unique, you can modify the metadata and pass for example, `"meta"` to this field ([`"content"`, `"meta"`]). | ||
In this case, the ID is generated by using the content and the defined metadata. | ||
:param progress_bar: Show a progress bar for the conversion. | ||
:param remove_code_snippets: Whether to remove snippets from the markdown file. | ||
:param extract_headlines: Whether to extract headings from the markdown file. | ||
""" | ||
super().__init__( | ||
remove_numeric_tables=remove_numeric_tables, | ||
valid_languages=valid_languages, | ||
id_hash_keys=id_hash_keys, | ||
progress_bar=progress_bar, | ||
) | ||
|
||
self.remove_code_snippets = remove_code_snippets | ||
self.extract_headlines = extract_headlines | ||
|
||
def convert( | ||
self, | ||
file_path: Path, | ||
meta: Optional[Dict[str, str]] = None, | ||
meta: Optional[Dict[str, Any]] = None, | ||
remove_numeric_tables: Optional[bool] = None, | ||
valid_languages: Optional[List[str]] = None, | ||
encoding: Optional[str] = "utf-8", | ||
id_hash_keys: Optional[List[str]] = None, | ||
remove_code_snippets: Optional[bool] = None, | ||
extract_headlines: Optional[bool] = None, | ||
) -> List[Document]: | ||
""" | ||
Reads text from a markdown file and executes optional preprocessing steps. | ||
|
@@ -40,32 +72,53 @@ def convert( | |
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are | ||
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). | ||
In this case the id will be generated by using the content and the defined metadata. | ||
:param remove_code_snippets: Whether to remove snippets from the markdown file. | ||
:param extract_headlines: Whether to extract headings from the markdown file. | ||
""" | ||
if id_hash_keys is None: | ||
id_hash_keys = self.id_hash_keys | ||
|
||
id_hash_keys = id_hash_keys if id_hash_keys is not None else self.id_hash_keys | ||
remove_code_snippets = remove_code_snippets if remove_code_snippets is not None else self.remove_code_snippets | ||
extract_headlines = extract_headlines if extract_headlines is not None else self.extract_headlines | ||
|
||
with open(file_path, encoding=encoding, errors="ignore") as f: | ||
markdown_text = f.read() | ||
text = self.markdown_to_text(markdown_text) | ||
|
||
# md -> html -> text since BeautifulSoup can extract text cleanly | ||
html = markdown(markdown_text) | ||
|
||
# remove code snippets | ||
if remove_code_snippets: | ||
html = re.sub(r"<pre>(.*?)</pre>", " ", html, flags=re.DOTALL) | ||
html = re.sub(r"<code>(.*?)</code>", " ", html, flags=re.DOTALL) | ||
soup = BeautifulSoup(html, "html.parser") | ||
|
||
if extract_headlines: | ||
text, headlines = self._extract_text_and_headlines(soup) | ||
if meta is None: | ||
meta = {} | ||
meta["headlines"] = headlines | ||
else: | ||
text = soup.get_text() | ||
|
||
document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys) | ||
return [document] | ||
|
||
# Following code snippet is copied from https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe | ||
@staticmethod | ||
def markdown_to_text(markdown_string: str) -> str: | ||
def _extract_text_and_headlines(soup: BeautifulSoup) -> Tuple[str, List[Dict]]: | ||
""" | ||
Converts a markdown string to plaintext | ||
|
||
:param markdown_string: String in markdown format | ||
Extracts text and headings from a soup object. | ||
""" | ||
# md -> html -> text since BeautifulSoup can extract text cleanly | ||
html = markdown(markdown_string) | ||
headline_tags = {"h1", "h2", "h3", "h4", "h5", "h6"} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Make the depth level configurable? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Or even better - find the deepest tree level in one pass with regular expressions? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These are the predefined HTML heading tags (`h1` to `h6`), as listed for example here. |
||
headlines = [] | ||
text = "" | ||
for desc in soup.descendants: | ||
if desc.name in headline_tags: | ||
current_headline = desc.get_text() | ||
current_start_idx = len(text) | ||
current_level = int(desc.name[-1]) - 1 | ||
headlines.append({"headline": current_headline, "start_idx": current_start_idx, "level": current_level}) | ||
|
||
# remove code snippets | ||
html = re.sub(r"<pre>(.*?)</pre>", " ", html) | ||
html = re.sub(r"<code>(.*?)</code >", " ", html) | ||
|
||
# extract text | ||
soup = BeautifulSoup(html, "html.parser") | ||
text = "".join(soup.findAll(text=True)) | ||
if isinstance(desc, NavigableString): | ||
text += desc.get_text() | ||
|
||
return text | ||
return text, headlines |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@bogdankostic another option is `id_hash_keys = id_hash_keys or self.id_hash_keys`, but I am undecided about which one is better.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`id_hash_keys = id_hash_keys or self.id_hash_keys` would work in this case because `id_hash_keys` is supposed to be a list, but this doesn't work with optional boolean parameters.
For example, if we explicitly set `remove_code_snippets` to `False` when calling `convert`, `remove_code_snippets or self.remove_code_snippets` would evaluate to whatever value `self.remove_code_snippets` has.
For consistency, I would like all these lines to have the same pattern.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great points