Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Improve URL previews for some pages #12951

Merged
merged 6 commits into from
Jun 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/12951.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve URL previews for pages with empty elements.
52 changes: 35 additions & 17 deletions synapse/rest/media/v1/preview_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)

# Elements carrying one of these ARIA `role` values aren't meant for display,
# so they are skipped when walking a page's text.
ARIA_ROLES_TO_IGNORE = {
    "directory",
    "menu",
    "menubar",
    "toolbar",
}


def _normalise_encoding(encoding: str) -> Optional[str]:
"""Use the Python codec's name as the normalised entry."""
Expand Down Expand Up @@ -174,13 +177,15 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",

og: Dict[str, Optional[str]] = {}
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
if "content" in tag.attrib:
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}
og[tag.attrib["property"]] = tag.attrib["content"]
for tag in tree.xpath(
"//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
):
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}

og[tag.attrib["property"]] = tag.attrib["content"]

# TODO: grab article: meta tags too, e.g.:

Expand All @@ -192,21 +197,23 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />

if "og:title" not in og:
# do some basic spidering of the HTML
title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
if title and title[0].text is not None:
og["og:title"] = title[0].text.strip()
# Attempt to find a title from the title tag, or the biggest header on the page.
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
if title:
og["og:title"] = title[0].strip()
else:
og["og:title"] = None

if "og:image" not in og:
# TODO: extract a favicon failing all else
meta_image = tree.xpath(
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
)
# If a meta image is found, use it.
if meta_image:
og["og:image"] = meta_image[0]
else:
# Try to find images which are larger than 10px by 10px.
#
# TODO: consider inlined CSS styles as well as width & height attribs
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
images = sorted(
Expand All @@ -215,17 +222,24 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
-1 * float(i.attrib["width"]) * float(i.attrib["height"])
),
)
# If no images were found, try to find *any* images.
if not images:
images = tree.xpath("//img[@src]")
images = tree.xpath("//img[@src][1]")
if images:
og["og:image"] = images[0].attrib["src"]

# Finally, fallback to the favicon if nothing else.
else:
favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
if favicons:
og["og:image"] = favicons[0]

if "og:description" not in og:
# Check the first meta description tag for content.
meta_description = tree.xpath(
"//*/meta"
"[translate(@name, 'DESCRIPTION', 'description')='description']"
"/@content"
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
)
# If a meta description is found with content, use it.
if meta_description:
og["og:description"] = meta_description[0]
else:
Expand Down Expand Up @@ -306,6 +320,10 @@ def _iterate_over_text(
if isinstance(el, str):
yield el
elif el.tag not in tags_to_ignore:
# If the element isn't meant for display, ignore it.
if el.get("role") in ARIA_ROLES_TO_IGNORE:
continue

# el.text is the text before the first child, so we can immediately
# return it if the text exists.
if el.text:
Expand Down
37 changes: 36 additions & 1 deletion tests/rest/media/v1/test_html_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def test_small_then_large_summarize(self) -> None:
)


class CalcOgTestCase(unittest.TestCase):
class OpenGraphFromHtmlTestCase(unittest.TestCase):
if not lxml:
skip = "url preview feature requires lxml"

Expand Down Expand Up @@ -235,6 +235,21 @@ def test_missing_title(self) -> None:

self.assertEqual(og, {"og:title": None, "og:description": "Some text."})

# Another variant is a title with no content.
html = b"""
<html>
<head><title></title></head>
<body>
<h1>Title</h1>
</body>
</html>
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": "Title", "og:description": "Title"})

def test_h1_as_title(self) -> None:
html = b"""
<html>
Expand All @@ -250,6 +265,26 @@ def test_h1_as_title(self) -> None:

self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})

def test_empty_description(self) -> None:
    """Empty or content-less description tags are skipped.

    The page carries `og:description` and `description` meta tags that are
    either empty or have no `content` attribute at all, followed by one
    `description` tag with real content; only the latter should be used.
    """
    page = b"""
<html>
<meta property="og:description" content=""/>
<meta property="og:description"/>
<meta name="description" content=""/>
<meta name="description"/>
<meta name="description" content="Finally!"/>
<body>
<h1>Title</h1>
</body>
</html>
"""

    parsed = parse_html_to_open_graph(
        decode_body(page, "http://example.com/test.html")
    )

    # The title falls back to the <h1>; the description comes from the only
    # meta tag with non-empty content.
    expected = {"og:title": "Title", "og:description": "Finally!"}
    self.assertEqual(parsed, expected)

def test_missing_title_and_broken_h1(self) -> None:
html = b"""
<html>
Expand Down