Skip to content

Commit

Permalink
Avoid trying to parse strings that cannot be IP Addresses (#1104)
Browse files Browse the repository at this point in the history
  • Loading branch information
bdraco committed Sep 5, 2024
1 parent bccb8af commit 51541ec
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 20 deletions.
3 changes: 3 additions & 0 deletions CHANGES/1104.misc.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Improved performance of encoding hosts -- by :user:`bdraco`.

Previously, the library would unconditionally try to parse a host as an IP Address. The library now avoids trying to parse a host as an IP Address if the string is not in one of the formats described in :rfc:`3986#section-3.2.2`.
15 changes: 15 additions & 0 deletions tests/test_url_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,21 @@ def test_build_simple():
assert str(u) == "http://127.0.0.1"


def test_url_build_ipv6():
    """Build a URL from a bare (unbracketed) IPv6 host and check its string form."""
    url = URL.build(scheme="http", host="::1")
    expected = "http://::1"
    assert str(url) == expected


def test_url_build_ipv6_brackets():
    """Build a URL from an IPv6 host passed in brackets and check its string form."""
    url = URL.build(scheme="http", host="[::1]")
    expected = "http://::1"
    assert str(url) == expected


def test_url_ipv4_in_ipv6():
    """Check an IPv6 host with a dotted-quad IPv4 tail.

    The expected string carries the ``192.0.2.33`` tail as the hex
    groups ``c000:221``.
    """
    url = URL.build(scheme="http", host="2001:db8:122:344::192.0.2.33")
    expected = "http://2001:db8:122:344::c000:221"
    assert str(url) == expected


def test_build_with_scheme():
    """Build a URL from only a scheme and a path and check its string form."""
    url = URL.build(scheme="blob", path="path")
    expected = "blob:path"
    assert str(url) == expected
Expand Down
58 changes: 38 additions & 20 deletions yarl/_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -850,27 +850,45 @@ def _normalize_path(cls, path: str) -> str:
@classmethod
def _encode_host(cls, host: str, human: bool = False) -> str:
raw_ip, sep, zone = host.partition("%")
# IP parsing is slow, so it's wrapped in an LRU cache
try:
ip_compressed_version = _ip_compressed_version(raw_ip)
except ValueError:
host = host.lower()
# IDNA encoding is slow,
# skip it for ASCII-only strings
# Don't move the check into _idna_encode() helper
# to reduce the cache size
if human or host.isascii():
if raw_ip and raw_ip[-1].isdigit() or ":" in raw_ip:
# Might be an IP address, check it
#
# IP Addresses can look like:
# https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2
# - 127.0.0.1 (last character is a digit)
# - 2001:db8::ff00:42:8329 (contains a colon)
# - 2001:db8::ff00:42:8329%eth0 (contains a colon)
# - [2001:db8::ff00:42:8329] (contains a colon)
# Rare IP Address formats are not supported per:
# https://datatracker.ietf.org/doc/html/rfc3986#section-7.4
#
# We try to avoid parsing IP addresses as much as possible
# since it is orders of magnitude slower than almost any other operation
# this library does.
#
# IP parsing is slow, so it's wrapped in an LRU cache
try:
ip_compressed_version = _ip_compressed_version(raw_ip)
except ValueError:
pass
else:
# These checks should not happen in the
# LRU to keep the cache size small
host, version = ip_compressed_version
if sep:
host += "%" + zone
if version == 6:
return f"[{host}]"
return host
return _idna_encode(host)

# These checks should not happen in the
# LRU to keep the cache size small
host, version = ip_compressed_version
if sep:
host += "%" + zone
if version == 6:
return f"[{host}]"
return host

host = host.lower()
# IDNA encoding is slow,
# skip it for ASCII-only strings
# Don't move the check into _idna_encode() helper
# to reduce the cache size
if human or host.isascii():
return host
return _idna_encode(host)

@classmethod
def _make_netloc(
Expand Down

0 comments on commit 51541ec

Please sign in to comment.