diff --git a/haystack/nodes/connector/crawler.py b/haystack/nodes/connector/crawler.py
index fcf4429228..ea8cf1880e 100644
--- a/haystack/nodes/connector/crawler.py
+++ b/haystack/nodes/connector/crawler.py
@@ -3,12 +3,15 @@
 import re
 import sys
 import json
+import time
 import logging
 from pathlib import Path
 from urllib.parse import urlparse
 
 try:
     from webdriver_manager.chrome import ChromeDriverManager
+    from selenium.webdriver.common.by import By
+    from selenium.common.exceptions import StaleElementReferenceException
     from selenium import webdriver
 except (ImportError, ModuleNotFoundError) as ie:
     from haystack.utils.import_utils import _optional_component_not_installed
@@ -48,6 +51,7 @@ def __init__(
         overwrite_existing_files=True,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text=True,
+        loading_wait_time: Optional[int] = None,
     ):
         """
         Init object with basic params for crawling (can be overwritten later).
@@ -66,6 +70,9 @@ def __init__(
                             In this case the id will be generated by using the content and the defined metadata.
         :param extract_hidden_text: Whether to extract the hidden text contained in page.
                                     E.g. the text can be inside a span with style="display: none"
+        :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
+            dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
+            E.g. 2: Crawler will wait 2 seconds before scraping page
         """
 
         super().__init__()
@@ -97,6 +104,7 @@ def __init__(
         self.overwrite_existing_files = overwrite_existing_files
         self.id_hash_keys = id_hash_keys
         self.extract_hidden_text = extract_hidden_text
+        self.loading_wait_time = loading_wait_time
 
     def crawl(
         self,
@@ -107,6 +115,7 @@ def crawl(
         overwrite_existing_files: Optional[bool] = None,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text: Optional[bool] = None,
+        loading_wait_time: Optional[int] = None,
     ) -> List[Path]:
         """
         Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -127,6 +136,9 @@ def crawl(
                             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
                             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
                             In this case the id will be generated by using the content and the defined metadata.
+        :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
+            dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
+            E.g. 2: Crawler will wait 2 seconds before scraping page
 
         :return: List of paths where the crawled webpages got stored
         """
@@ -145,6 +157,8 @@ def crawl(
             crawler_depth = self.crawler_depth
         if extract_hidden_text is None:
             extract_hidden_text = self.extract_hidden_text
+        if loading_wait_time is None:
+            loading_wait_time = self.loading_wait_time
 
         output_dir = Path(output_dir)
         if not output_dir.exists():
@@ -163,10 +177,18 @@ def crawl(
             for url in urls:
                 if pattern.search(url):
                     file_paths += self._write_to_files(
-                        [url], output_dir=output_dir, extract_hidden_text=extract_hidden_text
+                        [url],
+                        output_dir=output_dir,
+                        extract_hidden_text=extract_hidden_text,
+                        loading_wait_time=loading_wait_time,
                     )
         else:
-            file_paths += self._write_to_files(urls, output_dir=output_dir, extract_hidden_text=extract_hidden_text)
+            file_paths += self._write_to_files(
+                urls,
+                output_dir=output_dir,
+                extract_hidden_text=extract_hidden_text,
+                loading_wait_time=loading_wait_time,
+            )
         # follow one level of sublinks if requested
         if crawler_depth == 1:
             sub_links: Dict[str, List] = {}
@@ -174,7 +196,10 @@ def crawl(
                 already_found_links: List = list(sum(list(sub_links.values()), []))
                 sub_links[url_] = list(
                     self._extract_sublinks_from_url(
-                        base_url=url_, filter_urls=filter_urls, already_found_links=already_found_links
+                        base_url=url_,
+                        filter_urls=filter_urls,
+                        already_found_links=already_found_links,
+                        loading_wait_time=loading_wait_time,
                     )
                 )
             for url, extracted_sublink in sub_links.items():
@@ -184,6 +209,7 @@ def crawl(
                     base_url=url,
                     id_hash_keys=id_hash_keys,
                     extract_hidden_text=extract_hidden_text,
+                    loading_wait_time=loading_wait_time,
                 )
 
         return file_paths
@@ -195,12 +221,15 @@ def _write_to_files(
         extract_hidden_text: bool,
         base_url: str = None,
         id_hash_keys: Optional[List[str]] = None,
+        loading_wait_time: Optional[int] = None,
     ) -> List[Path]:
         paths = []
         for link in urls:
             logger.info(f"writing contents from `{link}`")
             self.driver.get(link)
-            el = self.driver.find_element_by_tag_name("body")
+            if loading_wait_time is not None:
+                time.sleep(loading_wait_time)
+            el = self.driver.find_element(by=By.TAG_NAME, value="body")
             if extract_hidden_text:
                 text = el.get_attribute("textContent")
             else:
@@ -232,6 +261,7 @@ def run(  # type: ignore
         return_documents: Optional[bool] = False,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text: Optional[bool] = True,
+        loading_wait_time: Optional[int] = None,
     ) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]:
         """
         Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -251,6 +281,9 @@ def run(  # type: ignore
                             In this case the id will be generated by using the content and the defined metadata.
         :param extract_hidden_text: Whether to extract the hidden text contained in page.
                                     E.g. the text can be inside a span with style="display: none"
+        :param loading_wait_time: Seconds to wait for page loading before scraping. Recommended when page relies on
+            dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
+            E.g. 2: Crawler will wait 2 seconds before scraping page
 
         :return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
         """
@@ -262,6 +295,7 @@ def run(  # type: ignore
             filter_urls=filter_urls,
             overwrite_existing_files=overwrite_existing_files,
             extract_hidden_text=extract_hidden_text,
+            loading_wait_time=loading_wait_time,
         )
         results: Dict[str, Union[List[Document], List[Path]]] = {}
         if return_documents:
@@ -285,6 +319,7 @@ def run_batch(  # type: ignore
         return_documents: Optional[bool] = False,
         id_hash_keys: Optional[List[str]] = None,
         extract_hidden_text: Optional[bool] = True,
+        loading_wait_time: Optional[int] = None,
     ):
         return self.run(
             output_dir=output_dir,
@@ -295,6 +330,7 @@ def run_batch(  # type: ignore
             return_documents=return_documents,
             id_hash_keys=id_hash_keys,
             extract_hidden_text=extract_hidden_text,
+            loading_wait_time=loading_wait_time,
         )
 
     @staticmethod
@@ -310,25 +346,35 @@ def _is_inpage_navigation(base_url: str, sub_link: str) -> bool:
         return base_url_.path == sub_link_.path and base_url_.netloc == sub_link_.netloc
 
     def _extract_sublinks_from_url(
-        self, base_url: str, filter_urls: Optional[List] = None, already_found_links: List = None
+        self,
+        base_url: str,
+        filter_urls: Optional[List] = None,
+        already_found_links: List = None,
+        loading_wait_time: Optional[int] = None,
     ) -> set:
         if filter_urls:
             filter_pattern = re.compile("|".join(filter_urls))
 
         self.driver.get(base_url)
-        a_elements = self.driver.find_elements_by_xpath("//a[@href]")
+        if loading_wait_time is not None:
+            time.sleep(loading_wait_time)
+        a_elements = self.driver.find_elements(by=By.XPATH, value="//a[@href]")
         sub_links = set()
 
         for i in a_elements:
-            sub_link = i.get_attribute("href")
-            if not (already_found_links and sub_link in already_found_links):
-                if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
-                    not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
-                ):
-                    if filter_urls:
-                        if filter_pattern.search(sub_link):
+            try:
+                sub_link = i.get_attribute("href")
+
+                if not (already_found_links and sub_link in already_found_links):
+                    if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
+                        not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
+                    ):
+                        if filter_urls:
+                            if filter_pattern.search(sub_link):
+                                sub_links.add(sub_link)
+                        else:
                             sub_links.add(sub_link)
-                    else:
-                        sub_links.add(sub_link)
+            except StaleElementReferenceException as error:
+                logger.error("Crawler couldn't find link, it has been removed from DOM.")
 
         return sub_links
diff --git a/test/nodes/test_connector.py b/test/nodes/test_connector.py
index 382c1e9bec..3afc1d0eb9 100644
--- a/test/nodes/test_connector.py
+++ b/test/nodes/test_connector.py
@@ -1,10 +1,13 @@
 from typing import List
 
 import json
+import time
 from pathlib import Path
 
 import pytest
 
+from selenium.webdriver.common.by import By
+
 from haystack.nodes.connector import Crawler
 from haystack.schema import Document
 
@@ -16,14 +19,16 @@ def test_url():
     return f"file://{SAMPLES_PATH.absolute()}/crawler"
 
 
-def content_match(crawler: Crawler, url: str, crawled_page: Path):
+def content_match(crawler: Crawler, url: str, crawled_page: Path, loading_wait_time: int = None):
     """
     :param crawler: the tested Crawler object
    :param url: the URL of the expected page
     :param crawled_page: the output of Crawler (one element of the paths list)
     """
     crawler.driver.get(url)
-    body = crawler.driver.find_element_by_tag_name("body")
+    if loading_wait_time is not None:
+        time.sleep(loading_wait_time)
+    body = crawler.driver.find_element(by=By.TAG_NAME, value="body")
 
     if crawler.extract_hidden_text:
         expected_crawled_content = body.get_attribute("textContent")
@@ -35,7 +40,9 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
     return page_data["content"] == expected_crawled_content
 
 
-def content_in_results(crawler: Crawler, url: str, results: List[Path], expected_matches_count=1):
+def content_in_results(
+    crawler: Crawler, url: str, results: List[Path], expected_matches_count=1, loading_wait_time: int = None
+):
     """
     Makes sure there is exactly one matching page in the list of pages returned by the crawler.
 
@@ -45,7 +52,7 @@ def content_in_results(crawler: Crawler, url: str, results: List[Path], expected
     :param results: the crawler's output (list of paths)
     :param expected_matches_count: how many copies of this page should be present in the results (default 1)
     """
-    return sum(content_match(crawler, url, path) for path in results) == expected_matches_count
+    return sum(content_match(crawler, url, path, loading_wait_time) for path in results) == expected_matches_count
 
 
 #
@@ -133,7 +140,7 @@ def test_crawler_filter_urls(test_url, tmp_path):
     paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
     assert len(paths) == 1
     assert content_match(crawler, test_url + "/page1.html", paths[0])
-    assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google\.com"], crawler_depth=1)
+    assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google.com"], crawler_depth=1)
 
 
 def test_crawler_return_document(test_url, tmp_path):
@@ -161,3 +168,17 @@ def test_crawler_extract_hidden_text(test_url, tmp_path):
     )
     crawled_content = documents["documents"][0].content
     assert "hidden text" not in crawled_content
+
+
+def test_crawler_loading_wait_time(test_url, tmp_path):
+    loading_wait_time = 3
+    crawler = Crawler(output_dir=tmp_path)
+    paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time)
+
+    assert len(paths) == 4
+    assert content_in_results(
+        crawler, test_url + "/page_dynamic_result.html", paths, loading_wait_time=loading_wait_time
+    )
+    assert content_in_results(crawler, test_url + "/index.html", paths)
+    assert content_in_results(crawler, test_url + "/page1.html", paths)
+    assert content_in_results(crawler, test_url + "/page2.html", paths)
diff --git a/test/samples/crawler/page_dynamic.html b/test/samples/crawler/page_dynamic.html
new file mode 100644
index 0000000000..0c57978964
--- /dev/null
+++ b/test/samples/crawler/page_dynamic.html
@@ -0,0 +1,22 @@
+
+
+
+    Test Dynamic Page
+
+
+    home page content
+
+    link to index
+    link to page with hidden text
+    link to page 1
+    link to page 2
+
+
\ No newline at end of file
diff --git a/test/samples/crawler/page_dynamic_result.html b/test/samples/crawler/page_dynamic_result.html
new file mode 100644
index 0000000000..e0c29d3131
--- /dev/null
+++ b/test/samples/crawler/page_dynamic_result.html
@@ -0,0 +1,22 @@
+
+
+
+    Test Dynamic Page
+
+
+    home page content
+
+    link to index
+
+    link to page 1
+    link to page 2
+
+
\ No newline at end of file
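
Usage note (not part of the patch): a minimal Python sketch of how the new loading_wait_time option is meant to be
used, mirroring test_crawler_loading_wait_time above. The URL and output directory are placeholders.

    from haystack.nodes.connector import Crawler

    # Give JavaScript-driven pages 3 seconds to finish their DOM updates before
    # the body text is scraped; omit loading_wait_time to keep full crawl speed.
    crawler = Crawler(output_dir="crawled_files")  # placeholder output directory
    paths = crawler.crawl(
        urls=["https://example.com"],  # placeholder URL
        crawler_depth=1,
        loading_wait_time=3,
    )
    print(paths)  # one JSON file written per crawled page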