
Support oEmbed for media previews. (#7920)
Fixes previews of Twitter URLs by using their oEmbed endpoint to grab content.
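For context, oEmbed is a small JSON-over-HTTP protocol: the consumer passes the target URL to a provider's endpoint and receives metadata such as type, html, title and cache_age in return. Below is a minimal standalone sketch of the kind of request this change performs, not the commit's own code (the endpoint is the Twitter one registered in the diff; the status URL and helper name are made-up examples):

    import json
    from urllib import parse, request

    def fetch_oembed(endpoint: str, url: str) -> dict:
        # Ask the provider for oEmbed metadata about `url`. Only the JSON
        # response format is consumed, matching what this commit supports.
        query = parse.urlencode({"url": url})
        with request.urlopen(endpoint + "?" + query) as resp:
            return json.load(resp)

    # fetch_oembed("https://publish.twitter.com/oembed",
    #              "https://twitter.com/someuser/status/123")
    # => {"version": "1.0", "type": "rich", "html": "<blockquote>...", ...}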
clokep authored Jul 27, 2020
1 parent b975fa2 commit 3fc8fdd
Showing 3 changed files with 355 additions and 53 deletions.
1 change: 1 addition & 0 deletions changelog.d/7920.feature
@@ -0,0 +1 @@
Support oEmbed for media previews.
265 changes: 220 additions & 45 deletions synapse/rest/media/v1/preview_url_resource.py
@@ -26,6 +26,7 @@
from typing import Dict, Optional
from urllib import parse as urlparse

import attr
from canonicaljson import json

from twisted.internet import defer
@@ -56,6 +57,65 @@
OG_TAG_NAME_MAXLEN = 50
OG_TAG_VALUE_MAXLEN = 1000

ONE_HOUR = 60 * 60 * 1000  # milliseconds

# A map of globs to API endpoints.
_oembed_globs = {
# Twitter.
"https://publish.twitter.com/oembed": [
"https://twitter.com/*/status/*",
"https://*.twitter.com/*/status/*",
"https://twitter.com/*/moments/*",
"https://*.twitter.com/*/moments/*",
# Include the HTTP versions too.
"http://twitter.com/*/status/*",
"http://*.twitter.com/*/status/*",
"http://twitter.com/*/moments/*",
"http://*.twitter.com/*/moments/*",
],
}
# Convert the globs to regular expressions.
_oembed_patterns = {}
for endpoint, globs in _oembed_globs.items():
for glob in globs:
# Convert the glob into a sane regular expression to match against. The
# rules followed will be slightly different for the domain portion vs.
# the rest.
#
# 1. The scheme must be one of HTTP / HTTPS (and have no globs).
# 2. The domain can have globs, but we limit it to characters that can
# reasonably be a domain part.
# TODO: This does not attempt to handle Unicode domain names.
# 3. Other parts allow a glob to be any one, or more, characters.
results = urlparse.urlparse(glob)

# Ensure the scheme does not have wildcards (and is a sane scheme).
if results.scheme not in {"http", "https"}:
raise ValueError("Insecure oEmbed glob scheme: %s" % (results.scheme,))

pattern = urlparse.urlunparse(
[
results.scheme,
re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"),
]
+ [re.escape(part).replace("\\*", ".+") for part in results[2:]]
)
_oembed_patterns[re.compile(pattern)] = endpoint
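As a worked illustration (not part of the commit; assumes the module's imports are in scope, and output is from Python 3.7+, where re.escape no longer escapes "/"; the host and user names are made up):

    >>> glob = "https://*.twitter.com/*/status/*"
    >>> results = urlparse.urlparse(glob)
    >>> pattern = urlparse.urlunparse(
    ...     [results.scheme, re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+")]
    ...     + [re.escape(part).replace("\\*", ".+") for part in results[2:]]
    ... )
    >>> pattern
    'https://[a-zA-Z0-9_-]+\\.twitter\\.com/.+/status/.+'
    >>> bool(re.compile(pattern).fullmatch("https://mobile.twitter.com/someuser/status/123"))
    True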


@attr.s
class OEmbedResult:
# Either HTML content or URL must be provided.
html = attr.ib(type=Optional[str])
url = attr.ib(type=Optional[str])
title = attr.ib(type=Optional[str])
    # Number of seconds to cache the content, or None if not provided.
    cache_age = attr.ib(type=Optional[int])


class OEmbedError(Exception):
"""An error occurred processing the oEmbed object."""

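For reference, a typical "rich" response from the Twitter endpoint looks roughly like the following (values illustrative; real responses carry more fields). _get_oembed_content below turns it into an OEmbedResult with html set and url left as None:

    result = {
        "version": "1.0",
        "type": "rich",
        "html": "<blockquote>...</blockquote>",
        "title": "An example tweet",
        "cache_age": "3153600000",  # some providers send this as a string
    }
    # After parsing:
    # OEmbedResult(html=result["html"], url=None,
    #              title="An example tweet", cache_age=3153600000)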

class PreviewUrlResource(DirectServeJsonResource):
isLeaf = True
@@ -99,7 +159,7 @@ def __init__(self, hs, media_repo, media_storage):
cache_name="url_previews",
clock=self.clock,
# don't spider URLs more often than once an hour
expiry_ms=60 * 60 * 1000,
expiry_ms=ONE_HOUR,
)

if self._worker_run_media_background_jobs:
@@ -310,6 +370,87 @@ async def _do_preview(self, url, user, ts):

return jsonog.encode("utf8")

def _get_oembed_url(self, url: str) -> Optional[str]:
"""
Check whether the URL should be downloaded as oEmbed content instead.
        Args:
            url: The URL to check.

        Returns:
            The oEmbed API endpoint to query, or None if the URL should be
            downloaded directly.
"""
for url_pattern, endpoint in _oembed_patterns.items():
if url_pattern.fullmatch(url):
return endpoint

# No match.
return None
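The matching behaviour, as a hypothetical interactive session (resource being a PreviewUrlResource instance):

    >>> resource._get_oembed_url("https://twitter.com/someuser/status/123")
    'https://publish.twitter.com/oembed'
    >>> resource._get_oembed_url("https://example.com/some/page") is None
    True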

async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult:
"""
Request content from an oEmbed endpoint.
        Args:
            endpoint: The oEmbed API endpoint.
            url: The URL to pass to the API.

        Returns:
            An object representing the metadata returned.

        Raises:
            OEmbedError if fetching or parsing of the oEmbed information fails.
"""
try:
logger.debug("Trying to get oEmbed content for url '%s'", url)
result = await self.client.get_json(
endpoint,
# TODO Specify max height / width.
# Note that only the JSON format is supported.
args={"url": url},
)

            # The oEmbed spec pins the version at "1.0"; reject anything else.
if result.get("version") != "1.0":
raise OEmbedError("Invalid version: %s" % (result.get("version"),))

oembed_type = result.get("type")

            # The cache age may be returned as a string; normalise it to an
            # int (it stays None if the provider omitted it).
cache_age = result.get("cache_age")
if cache_age:
cache_age = int(cache_age)

oembed_result = OEmbedResult(None, None, result.get("title"), cache_age)

# HTML content.
if oembed_type == "rich":
oembed_result.html = result.get("html")
return oembed_result

if oembed_type == "photo":
oembed_result.url = result.get("url")
return oembed_result

# TODO Handle link and video types.

if "thumbnail_url" in result:
oembed_result.url = result.get("thumbnail_url")
return oembed_result

raise OEmbedError("Incompatible oEmbed information.")

except OEmbedError as e:
# Trap OEmbedErrors first so we can directly re-raise them.
logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
raise

except Exception as e:
            # Wrap any other exception so the caller can fall back to a
            # normal preview.
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading oEmbed metadata from %s: %r", url, e)
raise OEmbedError() from e
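A sketch of the calling convention (hypothetical, but it mirrors the use in _download_url below): OEmbedError is the only exception a caller needs to handle, since everything else is wrapped.

    try:
        oembed_result = await self._get_oembed_content(oembed_url, url)
        # Use oembed_result.url / oembed_result.html as appropriate.
    except OEmbedError:
        # Fall back to fetching and parsing the original URL directly.
        pass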

async def _download_url(self, url, user):
# TODO: we should probably honour robots.txt... except in practice
# we're most likely being explicitly triggered by a human rather than a
@@ -319,54 +460,90 @@ async def _download_url(self, url, user):

file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)

with self.media_storage.store_into_file(file_info) as (f, fname, finish):
# If this URL can be accessed via oEmbed, use that instead.
url_to_download = url
oembed_url = self._get_oembed_url(url)
if oembed_url:
# The result might be a new URL to download, or it might be HTML content.
try:
logger.debug("Trying to get preview for url '%s'", url)
length, headers, uri, code = await self.client.get_file(
url,
output_stream=f,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url, e)
oembed_result = await self._get_oembed_content(oembed_url, url)
if oembed_result.url:
url_to_download = oembed_result.url
elif oembed_result.html:
url_to_download = None
except OEmbedError:
# If an error occurs, try doing a normal preview.
pass

raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)
await finish()
if url_to_download:
with self.media_storage.store_into_file(file_info) as (f, fname, finish):
try:
logger.debug("Trying to get preview for url '%s'", url_to_download)
length, headers, uri, code = await self.client.get_file(
url_to_download,
output_stream=f,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url_to_download, e)

raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)
await finish()

if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
else:
media_type = "application/octet-stream"

download_name = get_filename_from_headers(headers)

# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
expires = ONE_HOUR
etag = headers["ETag"][0] if "ETag" in headers else None
else:
html_bytes = oembed_result.html.encode("utf-8") # type: ignore
with self.media_storage.store_into_file(file_info) as (f, fname, finish):
f.write(html_bytes)
await finish()

media_type = "text/html"
download_name = oembed_result.title
length = len(html_bytes)
            # Use the cache age given by the provider, defaulting to 1 hour.
            # Note that cache_age is in seconds while the expiry fields are
            # in milliseconds.
            expires = (
                oembed_result.cache_age * 1000 if oembed_result.cache_age else ONE_HOUR
            )
uri = oembed_url
code = 200
etag = None

try:
if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
else:
media_type = "application/octet-stream"
time_now_ms = self.clock.time_msec()

download_name = get_filename_from_headers(headers)

await self.store.store_local_media(
media_id=file_id,
media_type=media_type,
time_now_ms=self.clock.time_msec(),
time_now_ms=time_now_ms,
upload_name=download_name,
media_length=length,
user_id=user,
@@ -389,10 +566,8 @@ async def _download_url(self, url, user):
"filename": fname,
"uri": uri,
"response_code": code,
# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
"expires": 60 * 60 * 1000,
"etag": headers["ETag"][0] if "ETag" in headers else None,
"expires": expires,
"etag": etag,
}

def _start_expire_url_cache_data(self):
@@ -449,7 +624,7 @@ async def _expire_url_cache_data(self):
# These may be cached for a bit on the client (i.e., they
# may have a room open with a preview url thing open).
# So we wait a couple of days before deleting, just in case.
expire_before = now - 2 * 24 * 60 * 60 * 1000
expire_before = now - 2 * 24 * ONE_HOUR
media_ids = await self.store.get_url_cache_media_before(expire_before)

removed_media = []