Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Autodiscover oEmbed URLs from HTML.
Browse files Browse the repository at this point in the history
  • Loading branch information
clokep committed Sep 22, 2021
1 parent ba7a41c commit 8e022a1
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 1 deletion.
26 changes: 26 additions & 0 deletions synapse/rest/media/v1/oembed.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,32 @@ def get_oembed_url(self, url: str) -> Optional[str]:
# No match.
return None

def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
"""
Search an HTML document for oEmbed autodiscovery information.
Args:
tree: The parsed HTML body.
Returns:
The URL to use for oEmbed information, or None if no URL was found.
"""
# Search for link elements with the proper rel and type attributes.
for tag in tree.xpath(
"//link[@rel='alternate'][@type='application/json+oembed']"
):
if "href" in tag.attrib:
return tag.attrib["href"]

# Some providers (e.g. Flickr) use alternative instead of alternate.
for tag in tree.xpath(
"//link[@rel='alternative'][@type='application/json+oembed']"
):
if "href" in tag.attrib:
return tag.attrib["href"]

return None

def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
"""
Parse the oEmbed response into an Open Graph response.
Expand Down
16 changes: 15 additions & 1 deletion synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,21 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
encoding = get_html_media_encoding(body, media_info.media_type)
tree = decode_body(body, encoding)
if tree is not None:
og = _calc_og(tree, media_info.uri)
# Check if this HTML document points to oEmbed information and
# defer to that.
oembed_url = self._oembed.autodiscover_from_html(tree)
og = {}
if oembed_url:
oembed_info = await self._download_url(oembed_url, user)
og, expiration_ms = await self._handle_oembed_response(
url, oembed_info, expiration_ms
)

# If there was no oEmbed URL (or oEmbed parsing failed), attempt
# to generate the Open Graph information from the HTML.
if not oembed_url or not og:
og = _calc_og(tree, media_info.uri)

await self._precache_image_url(user, media_info, og)
else:
og = {}
Expand Down
101 changes: 101 additions & 0 deletions tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,3 +721,104 @@ def test_oembed_format(self):
"og:description": "Content Preview",
},
)

def test_oembed_autodiscovery(self):
"""
Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL.
1. Request a preview of a URL which is not known to the oEmbed code.
2. It returns HTML including a link to an oEmbed preview.
3. The oEmbed preview is requested and returns a URL for an image.
4. The image is requested for thumbnailing.
"""
# This is a little cheesy in that we use the www subdomain (which isn't the
# list of oEmbed patterns) to get "raw" HTML response.
self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")]
self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]

result = b"""
<link rel="alternate" type="application/json+oembed"
href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
title="matrixdotorg" />
"""

channel = self.make_request(
"GET",
"preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
shorthand=False,
await_result=False,
)
self.pump()

client = self.reactor.tcpClients[0][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
b'Content-Type: text/html; charset="utf8"\r\n\r\n'
)
% (len(result),)
+ result
)

self.pump()

# The oEmbed response.
result2 = {
"version": "1.0",
"type": "photo",
"url": "http://cdn.twitter.com/matrixdotorg",
}
oembed_content = json.dumps(result2).encode("utf-8")

# Ensure a second request is made to the oEmbed URL.
client = self.reactor.tcpClients[1][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
b'Content-Type: application/json; charset="utf8"\r\n\r\n'
)
% (len(oembed_content),)
+ oembed_content
)

self.pump()

# Ensure the URL is what was requested.
self.assertIn(b"/oembed?", server.data)

# Ensure a third request is made to the photo URL.
client = self.reactor.tcpClients[2][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
b"Content-Type: image/png\r\n\r\n"
)
% (len(SMALL_PNG),)
+ SMALL_PNG
)

self.pump()

# Ensure the URL is what was requested.
self.assertIn(b"/matrixdotorg", server.data)

self.assertEqual(channel.code, 200)
body = channel.json_body
self.assertEqual(
body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
)
self.assertTrue(body["og:image"].startswith("mxc://"))
self.assertEqual(body["og:image:height"], 1)
self.assertEqual(body["og:image:width"], 1)
self.assertEqual(body["og:image:type"], "image/png")

0 comments on commit 8e022a1

Please sign in to comment.