-
Notifications
You must be signed in to change notification settings - Fork 14.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Sitemap specify default filter url (Fixes: CVE-2023-46229) #11925
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
import itertools | ||
import re | ||
from typing import Any, Callable, Generator, Iterable, List, Optional | ||
from typing import Any, Callable, Generator, Iterable, List, Optional, Tuple | ||
from urllib.parse import urlparse | ||
|
||
from langchain.document_loaders.web_base import WebBaseLoader | ||
from langchain.schema import Document | ||
|
@@ -20,8 +21,47 @@ def _batch_block(iterable: Iterable, size: int) -> Generator[List[dict], None, N | |
yield item | ||
|
||
|
||
def _extract_scheme_and_domain(url: str) -> Tuple[str, str]: | ||
"""Extract the scheme + domain from a given URL. | ||
|
||
Args: | ||
url (str): The input URL. | ||
|
||
Returns: | ||
A 2-tuple of (scheme, domain). | ||
""" | ||
parsed_uri = urlparse(url) | ||
return parsed_uri.scheme, parsed_uri.netloc | ||
|
||
|
||
class SitemapLoader(WebBaseLoader): | ||
"""Load a sitemap and its URLs.""" | ||
"""Load a sitemap and its URLs. | ||
|
||
**Security Note**: This loader can be used to load all URLs specified in a sitemap. | ||
If a malicious actor gets access to the sitemap, they could force | ||
the server to load URLs from other domains by modifying the sitemap. | ||
This could lead to server-side request forgery (SSRF) attacks; e.g., | ||
with the attacker forcing the server to load URLs from internal | ||
service endpoints that are not publicly accessible. While the attacker | ||
may not immediately gain access to this data, this data could leak | ||
into downstream systems (e.g., data loader is used to load data for indexing). | ||
|
||
This loader is a crawler and web crawlers should generally NOT be deployed | ||
with network access to any internal servers. | ||
|
||
Control access to who can submit crawling requests and what network access | ||
the crawler has. | ||
|
||
By default, the loader will only load URLs from the same domain as the sitemap | ||
if the sitemap is not a local file. This can be disabled by setting | ||
restrict_to_same_domain to False (not recommended). | ||
|
||
If the sitemap is a local file, no such risk mitigation is applied by default. | ||
|
||
Use the filter URLs argument to limit which URLs can be loaded. | ||
|
||
See https://python.langchain.com/docs/security | ||
""" | ||
|
||
def __init__( | ||
self, | ||
|
@@ -33,14 +73,22 @@ def __init__( | |
meta_function: Optional[Callable] = None, | ||
is_local: bool = False, | ||
continue_on_failure: bool = False, | ||
restrict_to_same_domain: bool = True, | ||
**kwargs: Any, | ||
): | ||
"""Initialize with webpage path and optional filter URLs. | ||
|
||
Args: | ||
web_path: url of the sitemap. can also be a local path | ||
filter_urls: list of strings or regexes that will be applied to filter the | ||
urls that are parsed and loaded | ||
filter_urls: a list of regexes. If specified, only | ||
URLs that match one of the filter URLs will be loaded. | ||
*WARNING* The filter URLs are interpreted as regular expressions. | ||
Remember to escape special characters if you do not want them to be | ||
interpreted as regular expression syntax. For example, `.` appears | ||
frequently in URLs and should be escaped if you want to match a literal | ||
`.` rather than any character. | ||
restrict_to_same_domain takes precedence over filter_urls when | ||
restrict_to_same_domain is True and the sitemap is not a local file. | ||
parsing_function: Function to parse bs4.Soup output | ||
blocksize: number of sitemap locations per block | ||
blocknum: the number of the block that should be loaded - zero indexed. | ||
|
@@ -53,6 +101,9 @@ def __init__( | |
occurs loading a url, emitting a warning instead of raising an | ||
exception. Setting this to True makes the loader more robust, but also | ||
may result in missing data. Default: False | ||
restrict_to_same_domain: whether to restrict loading to URLs to the same | ||
domain as the sitemap. Attention: This is only applied if the sitemap | ||
is not a local file! | ||
""" | ||
|
||
if blocksize is not None and blocksize < 1: | ||
|
@@ -65,12 +116,17 @@ def __init__( | |
import lxml # noqa:F401 | ||
except ImportError: | ||
raise ImportError( | ||
"lxml package not found, please install it with " "`pip install lxml`" | ||
"lxml package not found, please install it with `pip install lxml`" | ||
) | ||
|
||
super().__init__(web_paths=[web_path], **kwargs) | ||
|
||
self.filter_urls = filter_urls | ||
# Define a list of URL patterns (interpreted as regular expressions) that | ||
# will be allowed to be loaded. | ||
# restrict_to_same_domain takes precedence over filter_urls when | ||
# restrict_to_same_domain is True and the sitemap is not a local file. | ||
self.allow_url_patterns = filter_urls | ||
self.restrict_to_same_domain = restrict_to_same_domain | ||
self.parsing_function = parsing_function or _default_parsing_function | ||
self.meta_function = meta_function or _default_meta_function | ||
self.blocksize = blocksize | ||
|
@@ -96,8 +152,15 @@ def parse_sitemap(self, soup: Any) -> List[dict]: | |
# Strip leading and trailing whitespace and newlines | ||
loc_text = loc.text.strip() | ||
|
||
if self.filter_urls and not any( | ||
re.match(r, loc_text) for r in self.filter_urls | ||
if self.restrict_to_same_domain and not self.is_local: | ||
if _extract_scheme_and_domain(loc_text) != _extract_scheme_and_domain( | ||
self.web_path | ||
): | ||
continue | ||
|
||
if self.allow_url_patterns and not any( | ||
re.match(regexp_pattern, loc_text) | ||
for regexp_pattern in self.allow_url_patterns | ||
Comment on lines
+161
to
+163
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know if we care about performance at all, but this will recompile all the regexes for every visited URL which is a bit of a heavyweight operation. The regexes don't change so we could compile them once and then use the compiled forms each time. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not important in this case, can always optimize later if needed |
||
): | ||
continue | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mention the precedence in the docstring?