Sitemap specify default filter url (Fixes: CVE-2023-46229) #11925

Merged · 5 commits · Oct 17, 2023
75 changes: 68 additions & 7 deletions libs/langchain/langchain/document_loaders/sitemap.py
@@ -1,6 +1,7 @@
import itertools
import re
-from typing import Any, Callable, Generator, Iterable, List, Optional
+from typing import Any, Callable, Generator, Iterable, List, Optional, Tuple
+from urllib.parse import urlparse

from langchain.document_loaders.web_base import WebBaseLoader
from langchain.schema import Document
@@ -20,8 +21,47 @@ def _batch_block(iterable: Iterable, size: int) -> Generator[List[dict], None, None]:
yield item


+def _extract_scheme_and_domain(url: str) -> Tuple[str, str]:
+    """Extract the scheme + domain from a given URL.
+
+    Args:
+        url (str): The input URL.
+
+    Returns:
+        A 2-tuple of scheme and domain.
+    """
+    parsed_uri = urlparse(url)
+    return parsed_uri.scheme, parsed_uri.netloc
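
For context, a quick illustration of what this helper returns via the standard library's urlparse (not part of the diff; the URLs are placeholders). Note that a <loc> entry without a scheme parses into empty components, so it can never match the sitemap's scheme and domain:

    from urllib.parse import urlparse

    print(urlparse("https://example.com/path").scheme)  # "https"
    print(urlparse("https://example.com/path").netloc)  # "example.com"

    # A bare entry with no scheme yields an empty scheme and netloc,
    # so the same-domain check introduced below would skip it:
    print(urlparse("example.com/path").scheme)  # ""
    print(urlparse("example.com/path").netloc)  # ""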


class SitemapLoader(WebBaseLoader):
"""Load a sitemap and its URLs."""
"""Load a sitemap and its URLs.

**Security Note**: This loader can be used to load all URLs specified in a sitemap.
If a malicious actor gets access to the sitemap, they could force
the server to load URLs from other domains by modifying the sitemap.
This could lead to server-side request forgery (SSRF) attacks; e.g.,
with the attacker forcing the server to load URLs from internal
service endpoints that are not publicly accessible. While the attacker
may not immediately gain access to this data, this data could leak
into downstream systems (e.g., data loader is used to load data for indexing).

This loader is a crawler and web crawlers should generally NOT be deployed
with network access to any internal servers.

Control access to who can submit crawling requests and what network access
the crawler has.

By default, the loader will only load URLs from the same domain as the sitemap
if the site map is not a local file. This can be disabled by setting
restrict_to_same_domain to False (not recommended).

If the site map is a local file, no such risk mitigation is applied by default.

Use the filter URLs argument to limit which URLs can be loaded.

See https://python.langchain.com/docs/security
"""

def __init__(
self,
@@ -33,14 +73,20 @@ def __init__(
meta_function: Optional[Callable] = None,
is_local: bool = False,
continue_on_failure: bool = False,
+restrict_to_same_domain: bool = True,
**kwargs: Any,
):
"""Initialize with webpage path and optional filter URLs.

Args:
web_path: URL of the sitemap. Can also be a local path.
-filter_urls: list of strings or regexes that will be applied to filter the
-    urls that are parsed and loaded
+filter_urls: a list of regexes. If specified, only
+    URLs that match one of the filter patterns will be loaded.
+    *WARNING* The filter URLs are interpreted as regular expressions.
+    Remember to escape special characters if you do not want them to be
+    interpreted as regular expression syntax. For example, `.` appears
+    frequently in URLs and should be escaped if you want to match a literal
+    `.` rather than any character.
parsing_function: Function to parse bs4.Soup output
blocksize: number of sitemap locations per block
blocknum: the number of the block that should be loaded - zero indexed.
@@ -53,6 +99,9 @@ def __init__(
occurs loading a url, emitting a warning instead of raising an
exception. Setting this to True makes the loader more robust, but also
may result in missing data. Default: False
+restrict_to_same_domain: whether to restrict loading to URLs from the same
+    domain as the sitemap. Attention: this is only applied if the sitemap
+    is not a local file!
"""

if blocksize is not None and blocksize < 1:
@@ -70,7 +119,12 @@ def __init__(

super().__init__(web_paths=[web_path], **kwargs)

-self.filter_urls = filter_urls
+# Define a list of URL patterns (interpreted as regular expressions) that
+# will be allowed to be loaded.
+# restrict_to_same_domain takes precedence over filter_urls when
+# restrict_to_same_domain is True and the sitemap is not a local file.
+self.allow_url_patterns = filter_urls
+self.restrict_to_same_domain = restrict_to_same_domain

Collaborator (review comment on the precedence note): Mention the precedence in the docstring?
self.parsing_function = parsing_function or _default_parsing_function
self.meta_function = meta_function or _default_meta_function
self.blocksize = blocksize
@@ -96,8 +150,15 @@ def parse_sitemap(self, soup: Any) -> List[dict]:
# Strip leading and trailing whitespace and newlines
loc_text = loc.text.strip()

-if self.filter_urls and not any(
-    re.match(r, loc_text) for r in self.filter_urls
+if self.restrict_to_same_domain and not self.is_local:
+    if _extract_scheme_and_domain(loc_text) != _extract_scheme_and_domain(
+        self.web_path
+    ):
+        continue
+
+if self.allow_url_patterns and not any(
+    re.match(regexp_pattern, loc_text)
+    for regexp_pattern in self.allow_url_patterns
):
    continue

Comment on lines +161 to +163:

Collaborator: I don't know if we care about performance at all, but this will recompile all the regexes for every visited URL, which is a bit of a heavyweight operation. The regexes don't change, so we could compile them once and then use the compiled forms each time.

Collaborator (Author): Not important in this case; can always optimize later if needed.
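
For reference, the compile-once approach the reviewer describes could look roughly like the standalone sketch below (an illustration only, not code from this PR; the helper names are hypothetical):

    import re
    from typing import List, Optional, Pattern

    def compile_filter_patterns(
        filter_urls: Optional[List[str]],
    ) -> Optional[List[Pattern[str]]]:
        """Compile each filter regex once instead of on every visited URL."""
        if not filter_urls:
            return None
        return [re.compile(pattern) for pattern in filter_urls]

    def url_is_allowed(loc_text: str, compiled: Optional[List[Pattern[str]]]) -> bool:
        """Mirror the loader's allow-list check using precompiled patterns."""
        if compiled is None:
            return True
        return any(pattern.match(loc_text) for pattern in compiled)

    patterns = compile_filter_patterns([r"https://example\.com/docs/"])
    assert url_is_allowed("https://example.com/docs/page", patterns)
    assert not url_is_allowed("https://evil.example.org/", patterns)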

Second changed file (the sitemap loader's test module):

@@ -4,11 +4,12 @@
import pytest

from langchain.document_loaders import SitemapLoader
+from langchain.document_loaders.sitemap import _extract_scheme_and_domain


def test_sitemap() -> None:
"""Test sitemap loader."""
loader = SitemapLoader("https://langchain.readthedocs.io/sitemap.xml")
loader = SitemapLoader("https://api.python.langchain.com/sitemap.xml")
documents = loader.load()
assert len(documents) > 1
assert "LangChain Python API" in documents[0].page_content
@@ -17,7 +18,7 @@ def test_sitemap() -> None:
def test_sitemap_block() -> None:
"""Test sitemap loader."""
loader = SitemapLoader(
"https://langchain.readthedocs.io/sitemap.xml", blocksize=1, blocknum=1
"https://api.python.langchain.com/sitemap.xml", blocksize=1, blocknum=1
)
documents = loader.load()
assert len(documents) == 1
@@ -27,7 +28,7 @@ def test_sitemap_block() -> None:
def test_sitemap_block_only_one() -> None:
"""Test sitemap loader."""
loader = SitemapLoader(
"https://langchain.readthedocs.io/sitemap.xml", blocksize=1000000, blocknum=0
"https://api.python.langchain.com/sitemap.xml", blocksize=1000000, blocknum=0
)
documents = loader.load()
assert len(documents) > 1
@@ -37,7 +38,7 @@ def test_sitemap_block_only_one() -> None:
def test_sitemap_block_blocknum_default() -> None:
"""Test sitemap loader."""
loader = SitemapLoader(
"https://langchain.readthedocs.io/sitemap.xml", blocksize=1000000
"https://api.python.langchain.com/sitemap.xml", blocksize=1000000
)
documents = loader.load()
assert len(documents) > 1
@@ -47,14 +48,14 @@ def test_sitemap_block_blocknum_default() -> None:
def test_sitemap_block_size_to_small() -> None:
"""Test sitemap loader."""
with pytest.raises(ValueError, match="Sitemap blocksize should be at least 1"):
SitemapLoader("https://langchain.readthedocs.io/sitemap.xml", blocksize=0)
SitemapLoader("https://api.python.langchain.com/sitemap.xml", blocksize=0)


def test_sitemap_block_num_to_small() -> None:
"""Test sitemap loader."""
with pytest.raises(ValueError, match="Sitemap blocknum can not be lower then 0"):
SitemapLoader(
"https://langchain.readthedocs.io/sitemap.xml",
"https://api.python.langchain.com/sitemap.xml",
blocksize=1000000,
blocknum=-1,
)
@@ -63,7 +64,7 @@ def test_sitemap_block_num_to_small() -> None:
def test_sitemap_block_does_not_exists() -> None:
"""Test sitemap loader."""
loader = SitemapLoader(
"https://langchain.readthedocs.io/sitemap.xml", blocksize=1000000, blocknum=15
"https://api.python.langchain.com/sitemap.xml", blocksize=1000000, blocknum=15
)
with pytest.raises(
ValueError,
@@ -75,7 +76,7 @@ def test_sitemap_block_does_not_exists() -> None:
def test_filter_sitemap() -> None:
"""Test sitemap loader."""
loader = SitemapLoader(
"https://langchain.readthedocs.io/sitemap.xml",
"https://api.python.langchain.com/sitemap.xml",
filter_urls=["https://api.python.langchain.com/en/stable/"],
)
documents = loader.load()
@@ -89,7 +90,7 @@ def sitemap_metadata_one(meta: dict, _content: None) -> dict:

"""Test sitemap loader."""
loader = SitemapLoader(
"https://langchain.readthedocs.io/sitemap.xml",
"https://api.python.langchain.com/sitemap.xml",
meta_function=sitemap_metadata_one,
)
documents = loader.load()
@@ -107,7 +108,7 @@ def sitemap_metadata_two(meta: dict, content: Any) -> dict:

"""Test sitemap loader."""
loader = SitemapLoader(
"https://langchain.readthedocs.io/sitemap.xml",
"https://api.python.langchain.com/sitemap.xml",
meta_function=sitemap_metadata_two,
)
documents = loader.load()
@@ -118,7 +119,7 @@ def test_sitemap_metadata_default() -> None:

def test_sitemap_metadata_default() -> None:
"""Test sitemap loader."""
loader = SitemapLoader("https://langchain.readthedocs.io/sitemap.xml")
loader = SitemapLoader("https://api.python.langchain.com/sitemap.xml")
documents = loader.load()
assert len(documents) > 1
assert "source" in documents[0].metadata
@@ -132,3 +133,23 @@ def test_local_sitemap() -> None:
documents = loader.load()
assert len(documents) > 1
assert "🦜️🔗" in documents[0].page_content


+def test_extract_domain() -> None:
+    """Test domain extraction."""
+    assert _extract_scheme_and_domain("https://js.langchain.com/sitemap.xml") == (
+        "https",
+        "js.langchain.com",
+    )
+    assert _extract_scheme_and_domain("http://example.com/path/to/page") == (
+        "http",
+        "example.com",
+    )
+    assert _extract_scheme_and_domain("ftp://files.example.com") == (
+        "ftp",
+        "files.example.com",
+    )
+    assert _extract_scheme_and_domain("https://deep.subdomain.example.com") == (
+        "https",
+        "deep.subdomain.example.com",
+    )