Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Ensure each charset is attempted only once during medai preview. #11089

Merged
merged 3 commits into from
Oct 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/11089.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a long-standing bug when attempting to preview URLs which are in the `windows-1252` character encoding.
34 changes: 28 additions & 6 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import datetime
import errno
import fnmatch
Expand All @@ -22,7 +23,7 @@
import shutil
import sys
import traceback
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Tuple, Union
from urllib import parse as urlparse

import attr
Expand Down Expand Up @@ -631,6 +632,14 @@ def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
logger.debug("No media removed from url cache")


def _normalise_encoding(encoding: str) -> Optional[str]:
"""Use the Python codec's name as the normalised entry."""
try:
return codecs.lookup(encoding).name
except LookupError:
return None


def get_html_media_encodings(body: bytes, content_type: Optional[str]) -> Iterable[str]:
"""
Get potential encoding of the body based on the (presumably) HTML body or the content-type header.
Expand All @@ -652,30 +661,43 @@ def get_html_media_encodings(body: bytes, content_type: Optional[str]) -> Iterab
Returns:
The character encoding of the body, as a string.
"""
# There's no point in returning an encoding more than once.
attempted_encodings: Set[str] = set()

# Limit searches to the first 1kb, since it ought to be at the top.
body_start = body[:1024]

# Check if it has an encoding set in a meta tag.
match = _charset_match.search(body_start)
if match:
yield match.group(1).decode("ascii")
encoding = _normalise_encoding(match.group(1).decode("ascii"))
if encoding:
attempted_encodings.add(encoding)
yield encoding

# TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>

# Check if it has an XML document with an encoding.
match = _xml_encoding_match.match(body_start)
if match:
yield match.group(1).decode("ascii")
encoding = _normalise_encoding(match.group(1).decode("ascii"))
if encoding and encoding not in attempted_encodings:
attempted_encodings.add(encoding)
yield encoding

# Check the HTTP Content-Type header for a character set.
if content_type:
content_match = _content_type_match.match(content_type)
if content_match:
yield content_match.group(1)
encoding = _normalise_encoding(content_match.group(1))
if encoding and encoding not in attempted_encodings:
attempted_encodings.add(encoding)
yield encoding

# Finally, fallback to UTF-8, then windows-1252.
yield "utf-8"
yield "windows-1252"
for fallback in ("utf-8", "cp1252"):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cp1252 is the normalized version of windows-1252 (via codecs.lookup("windows-1252").name).

if fallback not in attempted_encodings:
yield fallback


def decode_body(
Expand Down
43 changes: 35 additions & 8 deletions tests/test_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def test_invalid_encoding2(self):
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})

def test_windows_1252(self):
"""A body which uses windows-1252, but doesn't declare that."""
"""A body which uses cp1252, but doesn't declare that."""
html = b"""
<html>
<head><title>\xf3</title></head>
Expand All @@ -333,7 +333,7 @@ def test_meta_charset(self):
""",
"text/html",
)
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])

# A less well-formed version.
encodings = get_html_media_encodings(
Expand All @@ -345,7 +345,7 @@ def test_meta_charset(self):
""",
"text/html",
)
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])

def test_meta_charset_underscores(self):
"""A character encoding contains underscore."""
Expand All @@ -358,7 +358,7 @@ def test_meta_charset_underscores(self):
""",
"text/html",
)
self.assertEqual(list(encodings), ["Shift_JIS", "utf-8", "windows-1252"])
self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])

def test_xml_encoding(self):
"""A character encoding is found via the meta tag."""
Expand All @@ -370,7 +370,7 @@ def test_xml_encoding(self):
""",
"text/html",
)
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])

def test_meta_xml_encoding(self):
"""Meta tags take precedence over XML encoding."""
Expand All @@ -384,7 +384,7 @@ def test_meta_xml_encoding(self):
""",
"text/html",
)
self.assertEqual(list(encodings), ["UTF-16", "ascii", "utf-8", "windows-1252"])
self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])

def test_content_type(self):
"""A character encoding is found via the Content-Type header."""
Expand All @@ -399,9 +399,36 @@ def test_content_type(self):
)
for header in headers:
encodings = get_html_media_encodings(b"", header)
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])

def test_fallback(self):
"""A character encoding cannot be found in the body or header."""
encodings = get_html_media_encodings(b"", "text/html")
self.assertEqual(list(encodings), ["utf-8", "windows-1252"])
self.assertEqual(list(encodings), ["utf-8", "cp1252"])

def test_duplicates(self):
"""Ensure each encoding is only attempted once."""
encodings = get_html_media_encodings(
b"""
<?xml version="1.0" encoding="utf8"?>
<html>
<head><meta charset="UTF-8">
</head>
</html>
""",
'text/html; charset="UTF_8"',
)
self.assertEqual(list(encodings), ["utf-8", "cp1252"])

def test_unknown_invalid(self):
"""A character encoding should be ignored if it is unknown or invalid."""
encodings = get_html_media_encodings(
b"""
<html>
<head><meta charset="invalid">
</head>
</html>
""",
'text/html; charset="invalid"',
)
self.assertEqual(list(encodings), ["utf-8", "cp1252"])