Improved crawler support for dynamically loaded pages
danielbichuetti committed Jun 22, 2022
1 parent 325bc54 commit f028331
Showing 4 changed files with 131 additions and 20 deletions.
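The headline change is a new optional `loading_wait_time` parameter on the Crawler: when set, the crawler sleeps that many seconds after each page load so client-side scripts can finish mutating the DOM before the text is extracted. A minimal usage sketch based on the signatures in this diff (the URL and output directory are placeholders, not taken from the commit):

```python
from haystack.nodes.connector import Crawler

# Wait 2 seconds after every page load before scraping, so dynamically
# injected or removed elements are reflected in the extracted text.
crawler = Crawler(output_dir="crawled_files", loading_wait_time=2)
paths = crawler.crawl(urls=["https://example.com"], crawler_depth=1)
```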
76 changes: 61 additions & 15 deletions haystack/nodes/connector/crawler.py
@@ -3,12 +3,15 @@
import re
import sys
import json
import time
import logging
from pathlib import Path
from urllib.parse import urlparse

try:
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
from selenium import webdriver
except (ImportError, ModuleNotFoundError) as ie:
from haystack.utils.import_utils import _optional_component_not_installed
@@ -48,6 +51,7 @@ def __init__(
overwrite_existing_files=True,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text=True,
loading_wait_time: Optional[int] = None,
):
"""
Init object with basic params for crawling (can be overwritten later).
@@ -66,6 +70,9 @@
In this case the id will be generated by using the content and the defined metadata.
:param extract_hidden_text: Whether to extract the hidden text contained in the page.
E.g. the text can be inside a span with style="display: none"
:param loading_wait_time: Seconds to wait for the page to load before scraping it. Recommended when the page relies on
dynamic DOM manipulation. Use carefully and only when needed, as it slows down crawling.
E.g. 2: the Crawler waits 2 seconds before scraping the page.
"""
super().__init__()

@@ -97,6 +104,7 @@ def __init__(
self.overwrite_existing_files = overwrite_existing_files
self.id_hash_keys = id_hash_keys
self.extract_hidden_text = extract_hidden_text
self.loading_wait_time = loading_wait_time

def crawl(
self,
@@ -107,6 +115,7 @@ def crawl(
overwrite_existing_files: Optional[bool] = None,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = None,
loading_wait_time: Optional[int] = None,
) -> List[Path]:
"""
Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -127,6 +136,9 @@
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
:param loading_wait_time: Seconds to wait for the page to load before scraping it. Recommended when the page relies on
dynamic DOM manipulation. Use carefully and only when needed, as it slows down crawling.
E.g. 2: the Crawler waits 2 seconds before scraping the page.
:return: List of paths where the crawled webpages got stored
"""
@@ -145,6 +157,8 @@
crawler_depth = self.crawler_depth
if extract_hidden_text is None:
extract_hidden_text = self.extract_hidden_text
if loading_wait_time is None:
loading_wait_time = self.loading_wait_time

output_dir = Path(output_dir)
if not output_dir.exists():
@@ -163,18 +177,29 @@
for url in urls:
if pattern.search(url):
file_paths += self._write_to_files(
[url], output_dir=output_dir, extract_hidden_text=extract_hidden_text
[url],
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)
else:
file_paths += self._write_to_files(urls, output_dir=output_dir, extract_hidden_text=extract_hidden_text)
file_paths += self._write_to_files(
urls,
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)
# follow one level of sublinks if requested
if crawler_depth == 1:
sub_links: Dict[str, List] = {}
for url_ in urls:
already_found_links: List = list(sum(list(sub_links.values()), []))
sub_links[url_] = list(
self._extract_sublinks_from_url(
base_url=url_, filter_urls=filter_urls, already_found_links=already_found_links
base_url=url_,
filter_urls=filter_urls,
already_found_links=already_found_links,
loading_wait_time=loading_wait_time,
)
)
for url, extracted_sublink in sub_links.items():
@@ -184,6 +209,7 @@
base_url=url,
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)

return file_paths
@@ -195,12 +221,15 @@ def _write_to_files(
extract_hidden_text: bool,
base_url: str = None,
id_hash_keys: Optional[List[str]] = None,
loading_wait_time: Optional[int] = None,
) -> List[Path]:
paths = []
for link in urls:
logger.info(f"writing contents from `{link}`")
self.driver.get(link)
el = self.driver.find_element_by_tag_name("body")
if loading_wait_time is not None:
time.sleep(loading_wait_time)
el = self.driver.find_element(by=By.TAG_NAME, value="body")
if extract_hidden_text:
text = el.get_attribute("textContent")
else:
@@ -232,6 +261,7 @@ def run( # type: ignore
return_documents: Optional[bool] = False,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]:
"""
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -251,6 +281,9 @@ def run( # type: ignore
In this case the id will be generated by using the content and the defined metadata.
:param extract_hidden_text: Whether to extract the hidden text contained in the page.
E.g. the text can be inside a span with style="display: none"
:param loading_wait_time: Seconds to wait for the page to load before scraping it. Recommended when the page relies on
dynamic DOM manipulation. Use carefully and only when needed, as it slows down crawling.
E.g. 2: the Crawler waits 2 seconds before scraping the page.
:return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
"""
@@ -262,6 +295,7 @@ def run( # type: ignore
filter_urls=filter_urls,
overwrite_existing_files=overwrite_existing_files,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)
results: Dict[str, Union[List[Document], List[Path]]] = {}
if return_documents:
@@ -285,6 +319,7 @@ def run_batch( # type: ignore
return_documents: Optional[bool] = False,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
):
return self.run(
output_dir=output_dir,
@@ -295,6 +330,7 @@ def run_batch( # type: ignore
return_documents=return_documents,
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)

@staticmethod
@@ -310,25 +346,35 @@ def _is_inpage_navigation(base_url: str, sub_link: str) -> bool:
return base_url_.path == sub_link_.path and base_url_.netloc == sub_link_.netloc

def _extract_sublinks_from_url(
self, base_url: str, filter_urls: Optional[List] = None, already_found_links: List = None
self,
base_url: str,
filter_urls: Optional[List] = None,
already_found_links: List = None,
loading_wait_time: Optional[int] = None,
) -> set:
if filter_urls:
filter_pattern = re.compile("|".join(filter_urls))

self.driver.get(base_url)
a_elements = self.driver.find_elements_by_xpath("//a[@href]")
if loading_wait_time is not None:
time.sleep(loading_wait_time)
a_elements = self.driver.find_elements(by=By.XPATH, value="//a[@href]")
sub_links = set()

for i in a_elements:
sub_link = i.get_attribute("href")
if not (already_found_links and sub_link in already_found_links):
if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
):
if filter_urls:
if filter_pattern.search(sub_link):
try:
sub_link = i.get_attribute("href")

if not (already_found_links and sub_link in already_found_links):
if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
):
if filter_urls:
if filter_pattern.search(sub_link):
sub_links.add(sub_link)
else:
sub_links.add(sub_link)
else:
sub_links.add(sub_link)
except StaleElementReferenceException as error:
logger.error("Crawler couldn't find link, it has been removed from DOM.")

return sub_links
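Besides the new parameter, the hunks above migrate the deprecated `find_element_by_tag_name` / `find_elements_by_xpath` helpers to Selenium 4's generic locator API and guard sublink extraction against `StaleElementReferenceException`. A condensed sketch of the locator pattern used here, assuming an already-created WebDriver instance is passed in:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver


def extract_text_and_links(driver: WebDriver) -> tuple:
    # Old, deprecated spellings:
    #   driver.find_element_by_tag_name("body")
    #   driver.find_elements_by_xpath("//a[@href]")
    body_text = driver.find_element(by=By.TAG_NAME, value="body").get_attribute("textContent")
    hrefs = [a.get_attribute("href") for a in driver.find_elements(by=By.XPATH, value="//a[@href]")]
    return body_text, hrefs
```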
31 changes: 26 additions & 5 deletions test/nodes/test_connector.py
@@ -1,10 +1,13 @@
from typing import List

import json
import time
from pathlib import Path

import pytest

from selenium.webdriver.common.by import By

from haystack.nodes.connector import Crawler
from haystack.schema import Document

@@ -16,14 +19,16 @@ def test_url():
return f"file://{SAMPLES_PATH.absolute()}/crawler"


def content_match(crawler: Crawler, url: str, crawled_page: Path):
def content_match(crawler: Crawler, url: str, crawled_page: Path, loading_wait_time: int = None):
"""
:param crawler: the tested Crawler object
:param url: the URL of the expected page
:param crawled_page: the output of Crawler (one element of the paths list)
"""
crawler.driver.get(url)
body = crawler.driver.find_element_by_tag_name("body")
if loading_wait_time is not None:
time.sleep(loading_wait_time)
body = crawler.driver.find_element(by=By.TAG_NAME, value="body")

if crawler.extract_hidden_text:
expected_crawled_content = body.get_attribute("textContent")
@@ -35,7 +40,9 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
return page_data["content"] == expected_crawled_content


def content_in_results(crawler: Crawler, url: str, results: List[Path], expected_matches_count=1):
def content_in_results(
crawler: Crawler, url: str, results: List[Path], expected_matches_count=1, loading_wait_time: int = None
):
"""
Makes sure there is exactly one matching page in the list of pages returned
by the crawler.
@@ -45,7 +52,7 @@ def content_in_results(crawler: Crawler, url: str, results: List[Path], expected
:param results: the crawler's output (list of paths)
:param expected_matches_count: how many copies of this page should be present in the results (default 1)
"""
return sum(content_match(crawler, url, path) for path in results) == expected_matches_count
return sum(content_match(crawler, url, path, loading_wait_time) for path in results) == expected_matches_count


#
Expand Down Expand Up @@ -133,7 +140,7 @@ def test_crawler_filter_urls(test_url, tmp_path):
paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
assert len(paths) == 1
assert content_match(crawler, test_url + "/page1.html", paths[0])
assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google\.com"], crawler_depth=1)
assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google.com"], crawler_depth=1)


def test_crawler_return_document(test_url, tmp_path):
@@ -161,3 +168,17 @@ def test_crawler_extract_hidden_text(test_url, tmp_path):
)
crawled_content = documents["documents"][0].content
assert "hidden text" not in crawled_content


def test_crawler_loading_wait_time(test_url, tmp_path):
loading_wait_time = 3
crawler = Crawler(output_dir=tmp_path)
paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time)

assert len(paths) == 4
assert content_in_results(
crawler, test_url + "/page_dynamic_result.html", paths, loading_wait_time=loading_wait_time
)
assert content_in_results(crawler, test_url + "/index.html", paths)
assert content_in_results(crawler, test_url + "/page1.html", paths)
assert content_in_results(crawler, test_url + "/page2.html", paths)
22 changes: 22 additions & 0 deletions test/samples/crawler/page_dynamic.html
@@ -0,0 +1,22 @@
<!DOCTYPE html>
<html>
<head>
<title>Test Dynamic Page</title>
</head>
<body>
<p>home page content</p>
<a href="index.html" id="a1">link to index</a>
<a href="page_w_hidden_text.html" id="a2">link to page with hidden text</a>
<a href="page1.html" id="a3">link to page 1</a>
<a href="page2.html" id="a4">link to page 2</a>
<script>
const removeTimeout = setTimeout(myRemoveFunction, 150);

function myRemoveFunction() {
const elem = document.querySelector('#a2')
elem.parentNode.removeChild(elem)
clearTimeout(removeTimeout);
}
</script>
</body>
</html>
22 changes: 22 additions & 0 deletions test/samples/crawler/page_dynamic_result.html
@@ -0,0 +1,22 @@
<!DOCTYPE html>
<html>
<head>
<title>Test Dynamic Page</title>
</head>
<body>
<p>home page content</p>
<a href="index.html" id="a1">link to index</a>

<a href="page1.html" id="a3">link to page 1</a>
<a href="page2.html" id="a4">link to page 2</a>
<script>
const removeTimeout = setTimeout(myRemoveFunction, 150);

function myRemoveFunction() {
const elem = document.querySelector('#a2')
elem.parentNode.removeChild(elem)
clearTimeout(removeTimeout);
}
</script>
</body>
</html>
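The commit relies on a fixed `time.sleep`, which is the simplest way to cover the 150 ms mutation that page_dynamic.html performs on itself. When the awaited change can be detected directly, an explicit wait is a common alternative; a sketch (not part of this commit) using Selenium's `WebDriverWait`:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_dynamic_page(driver: WebDriver, timeout: float = 3.0) -> None:
    # Block until the page's own script has removed the #a2 link,
    # instead of sleeping for a fixed loading_wait_time.
    WebDriverWait(driver, timeout).until(
        lambda d: not d.find_elements(By.CSS_SELECTOR, "#a2")
    )
```

The trade-off is that `WebDriverWait` needs a page-specific condition, whereas the fixed `loading_wait_time` added here works for arbitrary pages at the cost of a constant delay.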
