Improved crawler support for dynamically loaded pages
danielbichuetti committed Jun 22, 2022
1 parent 325bc54 commit f028331
Showing 4 changed files with 131 additions and 20 deletions.
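The headline change is a new optional `loading_wait_time` parameter on the Crawler: when set, the crawler sleeps that many seconds after each page load so client-side scripts can finish mutating the DOM before the text is extracted. A minimal usage sketch based on the signatures in this diff (the URL and output directory are placeholders, not taken from the commit):

```python
from haystack.nodes.connector import Crawler

# Wait 2 seconds after every page load before scraping, so dynamically
# injected or removed elements are reflected in the extracted text.
crawler = Crawler(output_dir="crawled_files", loading_wait_time=2)
paths = crawler.crawl(urls=["https://example.com"], crawler_depth=1)
```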
76 changes: 61 additions & 15 deletions haystack/nodes/connector/crawler.py
@@ -3,12 +3,15 @@
import re
import sys
import json
import time
import logging
from pathlib import Path
from urllib.parse import urlparse

try:
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
from selenium import webdriver
except (ImportError, ModuleNotFoundError) as ie:
from haystack.utils.import_utils import _optional_component_not_installed
@@ -48,6 +51,7 @@ def __init__(
overwrite_existing_files=True,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text=True,
loading_wait_time: Optional[int] = None,
):
"""
Init object with basic params for crawling (can be overwritten later).
@@ -66,6 +70,9 @@
In this case the id will be generated by using the content and the defined metadata.
:param extract_hidden_text: Whether to extract the hidden text contained in the page.
E.g. the text can be inside a span with style="display: none"
:param loading_wait_time: Seconds to wait for the page to load before scraping it. Recommended when the page relies on
dynamic DOM manipulation. Use carefully and only when needed, as it slows down crawling.
E.g. 2: the Crawler waits 2 seconds before scraping the page.
"""
super().__init__()

@@ -97,6 +104,7 @@ def __init__(
self.overwrite_existing_files = overwrite_existing_files
self.id_hash_keys = id_hash_keys
self.extract_hidden_text = extract_hidden_text
self.loading_wait_time = loading_wait_time

def crawl(
self,
@@ -107,6 +115,7 @@ def crawl(
overwrite_existing_files: Optional[bool] = None,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = None,
loading_wait_time: Optional[int] = None,
) -> List[Path]:
"""
Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -127,6 +136,9 @@
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
:param loading_wait_time: Seconds to wait for the page to load before scraping it. Recommended when the page relies on
dynamic DOM manipulation. Use carefully and only when needed, as it slows down crawling.
E.g. 2: the Crawler waits 2 seconds before scraping the page.
:return: List of paths where the crawled webpages got stored
"""
@@ -145,6 +157,8 @@
crawler_depth = self.crawler_depth
if extract_hidden_text is None:
extract_hidden_text = self.extract_hidden_text
if loading_wait_time is None:
loading_wait_time = self.loading_wait_time

output_dir = Path(output_dir)
if not output_dir.exists():
@@ -163,18 +177,29 @@
for url in urls:
if pattern.search(url):
file_paths += self._write_to_files(
[url], output_dir=output_dir, extract_hidden_text=extract_hidden_text
[url],
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)
else:
file_paths += self._write_to_files(urls, output_dir=output_dir, extract_hidden_text=extract_hidden_text)
file_paths += self._write_to_files(
urls,
output_dir=output_dir,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)
# follow one level of sublinks if requested
if crawler_depth == 1:
sub_links: Dict[str, List] = {}
for url_ in urls:
already_found_links: List = list(sum(list(sub_links.values()), []))
sub_links[url_] = list(
self._extract_sublinks_from_url(
base_url=url_, filter_urls=filter_urls, already_found_links=already_found_links
base_url=url_,
filter_urls=filter_urls,
already_found_links=already_found_links,
loading_wait_time=loading_wait_time,
)
)
for url, extracted_sublink in sub_links.items():
@@ -184,6 +209,7 @@
base_url=url,
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)

return file_paths
@@ -195,12 +221,15 @@ def _write_to_files(
extract_hidden_text: bool,
base_url: str = None,
id_hash_keys: Optional[List[str]] = None,
loading_wait_time: Optional[int] = None,
) -> List[Path]:
paths = []
for link in urls:
logger.info(f"writing contents from `{link}`")
self.driver.get(link)
el = self.driver.find_element_by_tag_name("body")
if loading_wait_time is not None:
time.sleep(loading_wait_time)
el = self.driver.find_element(by=By.TAG_NAME, value="body")
if extract_hidden_text:
text = el.get_attribute("textContent")
else:
@@ -232,6 +261,7 @@ def run( # type: ignore
return_documents: Optional[bool] = False,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]:
"""
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
@@ -251,6 +281,9 @@ def run( # type: ignore
In this case the id will be generated by using the content and the defined metadata.
:param extract_hidden_text: Whether to extract the hidden text contained in the page.
E.g. the text can be inside a span with style="display: none"
:param loading_wait_time: Seconds to wait for the page to load before scraping it. Recommended when the page relies on
dynamic DOM manipulation. Use carefully and only when needed, as it slows down crawling.
E.g. 2: the Crawler waits 2 seconds before scraping the page.
:return: Tuple({"paths": List of filepaths, ...}, Name of output edge)
"""
@@ -262,6 +295,7 @@ def run( # type: ignore
filter_urls=filter_urls,
overwrite_existing_files=overwrite_existing_files,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)
results: Dict[str, Union[List[Document], List[Path]]] = {}
if return_documents:
@@ -285,6 +319,7 @@ def run_batch( # type: ignore
return_documents: Optional[bool] = False,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
):
return self.run(
output_dir=output_dir,
@@ -295,6 +330,7 @@ def run_batch( # type: ignore
return_documents=return_documents,
id_hash_keys=id_hash_keys,
extract_hidden_text=extract_hidden_text,
loading_wait_time=loading_wait_time,
)

@staticmethod
@@ -310,25 +346,35 @@ def _is_inpage_navigation(base_url: str, sub_link: str) -> bool:
return base_url_.path == sub_link_.path and base_url_.netloc == sub_link_.netloc

def _extract_sublinks_from_url(
self, base_url: str, filter_urls: Optional[List] = None, already_found_links: List = None
self,
base_url: str,
filter_urls: Optional[List] = None,
already_found_links: List = None,
loading_wait_time: Optional[int] = None,
) -> set:
if filter_urls:
filter_pattern = re.compile("|".join(filter_urls))

self.driver.get(base_url)
a_elements = self.driver.find_elements_by_xpath("//a[@href]")
if loading_wait_time is not None:
time.sleep(loading_wait_time)
a_elements = self.driver.find_elements(by=By.XPATH, value="//a[@href]")
sub_links = set()

for i in a_elements:
sub_link = i.get_attribute("href")
if not (already_found_links and sub_link in already_found_links):
if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
):
if filter_urls:
if filter_pattern.search(sub_link):
try:
sub_link = i.get_attribute("href")

if not (already_found_links and sub_link in already_found_links):
if self._is_internal_url(base_url=base_url, sub_link=sub_link) and (
not self._is_inpage_navigation(base_url=base_url, sub_link=sub_link)
):
if filter_urls:
if filter_pattern.search(sub_link):
sub_links.add(sub_link)
else:
sub_links.add(sub_link)
else:
sub_links.add(sub_link)
except StaleElementReferenceException as error:
logger.error("Crawler couldn't find link, it has been removed from DOM.")

return sub_links
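Besides the new parameter, the hunks above migrate the deprecated `find_element_by_tag_name` / `find_elements_by_xpath` helpers to Selenium 4's generic locator API and guard sublink extraction against `StaleElementReferenceException`. A condensed sketch of the locator pattern used here, assuming an already-created WebDriver instance is passed in:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver


def extract_text_and_links(driver: WebDriver) -> tuple:
    # Old, deprecated spellings:
    #   driver.find_element_by_tag_name("body")
    #   driver.find_elements_by_xpath("//a[@href]")
    body_text = driver.find_element(by=By.TAG_NAME, value="body").get_attribute("textContent")
    hrefs = [a.get_attribute("href") for a in driver.find_elements(by=By.XPATH, value="//a[@href]")]
    return body_text, hrefs
```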
31 changes: 26 additions & 5 deletions test/nodes/test_connector.py
@@ -1,10 +1,13 @@
from typing import List

import json
import time
from pathlib import Path

import pytest

from selenium.webdriver.common.by import By

from haystack.nodes.connector import Crawler
from haystack.schema import Document

@@ -16,14 +19,16 @@ def test_url():
return f"file://{SAMPLES_PATH.absolute()}/crawler"


def content_match(crawler: Crawler, url: str, crawled_page: Path):
def content_match(crawler: Crawler, url: str, crawled_page: Path, loading_wait_time: int = None):
"""
:param crawler: the tested Crawler object
:param url: the URL of the expected page
:param crawled_page: the output of Crawler (one element of the paths list)
"""
crawler.driver.get(url)
body = crawler.driver.find_element_by_tag_name("body")
if loading_wait_time is not None:
time.sleep(loading_wait_time)
body = crawler.driver.find_element(by=By.TAG_NAME, value="body")

if crawler.extract_hidden_text:
expected_crawled_content = body.get_attribute("textContent")
@@ -35,7 +40,9 @@ def content_match(crawler: Crawler, url: str, crawled_page: Path):
return page_data["content"] == expected_crawled_content


def content_in_results(crawler: Crawler, url: str, results: List[Path], expected_matches_count=1):
def content_in_results(
crawler: Crawler, url: str, results: List[Path], expected_matches_count=1, loading_wait_time: int = None
):
"""
Makes sure there is exactly one matching page in the list of pages returned
by the crawler.
@@ -45,7 +52,7 @@ def content_in_results(crawler: Crawler, url: str, results: List[Path], expected
:param results: the crawler's output (list of paths)
:param expected_matches_count: how many copies of this page should be present in the results (default 1)
"""
return sum(content_match(crawler, url, path) for path in results) == expected_matches_count
return sum(content_match(crawler, url, path, loading_wait_time) for path in results) == expected_matches_count


#
Expand Down Expand Up @@ -133,7 +140,7 @@ def test_crawler_filter_urls(test_url, tmp_path):
paths = crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["page1"], crawler_depth=1)
assert len(paths) == 1
assert content_match(crawler, test_url + "/page1.html", paths[0])
assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google\.com"], crawler_depth=1)
assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google.com"], crawler_depth=1)


def test_crawler_return_document(test_url, tmp_path):
@@ -161,3 +168,17 @@ def test_crawler_extract_hidden_text(test_url, tmp_path):
)
crawled_content = documents["documents"][0].content
assert "hidden text" not in crawled_content


def test_crawler_loading_wait_time(test_url, tmp_path):
loading_wait_time = 3
crawler = Crawler(output_dir=tmp_path)
paths = crawler.crawl(urls=[test_url + "/page_dynamic.html"], crawler_depth=1, loading_wait_time=loading_wait_time)

assert len(paths) == 4
assert content_in_results(
crawler, test_url + "/page_dynamic_result.html", paths, loading_wait_time=loading_wait_time
)
assert content_in_results(crawler, test_url + "/index.html", paths)
assert content_in_results(crawler, test_url + "/page1.html", paths)
assert content_in_results(crawler, test_url + "/page2.html", paths)
22 changes: 22 additions & 0 deletions test/samples/crawler/page_dynamic.html
@@ -0,0 +1,22 @@
<!DOCTYPE html>
<html>
<head>
<title>Test Dynamic Page</title>
</head>
<body>
<p>home page content</p>
<a href="index.html" id="a1">link to index</a>
<a href="page_w_hidden_text.html" id="a2">link to page with hidden text</a>
<a href="page1.html" id="a3">link to page 1</a>
<a href="page2.html" id="a4">link to page 2</a>
<script>
const removeTimeout = setTimeout(myRemoveFunction, 150);

function myRemoveFunction() {
const elem = document.querySelector('#a2')
elem.parentNode.removeChild(elem)
clearTimeout(removeTimeout);
}
</script>
</body>
</html>
22 changes: 22 additions & 0 deletions test/samples/crawler/page_dynamic_result.html
@@ -0,0 +1,22 @@
<!DOCTYPE html>
<html>
<head>
<title>Test Dynamic Page</title>
</head>
<body>
<p>home page content</p>
<a href="index.html" id="a1">link to index</a>

<a href="page1.html" id="a3">link to page 1</a>
<a href="page2.html" id="a4">link to page 2</a>
<script>
const removeTimeout = setTimeout(myRemoveFunction, 150);

function myRemoveFunction() {
const elem = document.querySelector('#a2')
elem.parentNode.removeChild(elem)
clearTimeout(removeTimeout);
}
</script>
</body>
</html>
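The commit relies on a fixed `time.sleep`, which is the simplest way to cover the 150 ms mutation that page_dynamic.html performs on itself. When the awaited change can be detected directly, an explicit wait is a common alternative; a sketch (not part of this commit) using Selenium's `WebDriverWait`:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_dynamic_page(driver: WebDriver, timeout: float = 3.0) -> None:
    # Block until the page's own script has removed the #a2 link,
    # instead of sleeping for a fixed loading_wait_time.
    WebDriverWait(driver, timeout).until(
        lambda d: not d.find_elements(By.CSS_SELECTOR, "#a2")
    )
```

The trade-off is that `WebDriverWait` needs a page-specific condition, whereas the fixed `loading_wait_time` added here works for arbitrary pages at the cost of a constant delay.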
