Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit df758e1

Browse files
authored
Use <meta> tags to discover the per-page encoding of html previews (#4183)
1 parent a51288e commit df758e1

File tree

3 files changed

+100
-9
lines changed

3 files changed

+100
-9
lines changed

changelog.d/4183.bugfix

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
URL previews now correctly decode non-UTF-8 text if the page's `<head>` contains a `<meta http-equiv="Content-Type" ...>` tag declaring the charset.

synapse/rest/media/v1/preview_url_resource.py

+22-9
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@
5353

5454
logger = logging.getLogger(__name__)
5555

56+
# Finds a charset declaration inside a <meta> tag in raw (undecoded) HTML bytes.
# Handles both the legacy form
#   <meta http-equiv="Content-Type" content="text/html; charset=windows-1251">
# and the short form <meta charset="utf-8">, where the value is quoted.
# Group 1 is the charset name as bytes; underscores are accepted so that names
# such as Shift_JIS are captured whole rather than truncated.
_charset_match = re.compile(
    br"""<\s*meta[^>]*charset\s*=\s*["']?([a-z0-9_-]+)""", flags=re.I
)
57+
# Pulls the charset parameter out of a Content-Type value such as
# 'text/html; charset="utf-8"'. Group 1 is the charset name with any
# surrounding double quotes removed; matching is case-insensitive.
_content_type_match = re.compile(
    r'.*; *charset="?(.*?)"?(;|$)',
    re.IGNORECASE,
)
58+
5659

5760
class PreviewUrlResource(Resource):
5861
isLeaf = True
@@ -223,15 +226,25 @@ def _do_preview(self, url, user, ts):
223226
with open(media_info['filename'], 'rb') as file:
224227
body = file.read()
225228

226-
# clobber the encoding from the content-type, or default to utf-8
227-
# XXX: this overrides any <meta/> or XML charset headers in the body
228-
# which may pose problems, but so far seems to work okay.
229-
match = re.match(
230-
r'.*; *charset="?(.*?)"?(;|$)',
231-
media_info['media_type'],
232-
re.I
233-
)
234-
encoding = match.group(1) if match else "utf-8"
229+
encoding = None
230+
231+
# Let's try and figure out if it has an encoding set in a meta tag.
232+
# Limit it to the first 1kb, since it ought to be in the meta tags
233+
# at the top.
234+
match = _charset_match.search(body[:1000])
235+
236+
# If we find a match, it should take precedence over the
237+
# Content-Type header, so set it here.
238+
if match:
239+
encoding = match.group(1).decode('ascii')
240+
241+
# If we don't find a match, we'll look at the HTTP Content-Type, and
242+
# if that doesn't exist, we'll fall back to UTF-8.
243+
if not encoding:
244+
match = _content_type_match.match(
245+
media_info['media_type']
246+
)
247+
encoding = match.group(1) if match else "utf-8"
235248

236249
og = decode_and_calc_og(body, media_info['uri'], encoding)
237250

tests/rest/media/v1/test_url_preview.py

+77
Original file line numberDiff line numberDiff line change
@@ -162,3 +162,80 @@ def test_cache_returns_correct_type(self):
162162
self.assertEqual(
163163
channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
164164
)
165+
166+
def test_non_ascii_preview_httpequiv(self):
167+
168+
request, channel = self.make_request(
169+
"GET", "url_preview?url=matrix.org", shorthand=False
170+
)
171+
request.render(self.preview_url)
172+
self.pump()
173+
174+
# We've made one fetch
175+
self.assertEqual(len(self.fetches), 1)
176+
177+
end_content = (
178+
b'<html><head>'
179+
b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
180+
b'<meta property="og:title" content="\xe4\xea\xe0" />'
181+
b'<meta property="og:description" content="hi" />'
182+
b'</head></html>'
183+
)
184+
185+
self.fetches[0][0].callback(
186+
(
187+
end_content,
188+
(
189+
len(end_content),
190+
{
191+
b"Content-Length": [b"%d" % (len(end_content))],
192+
# This charset="utf8" should be ignored, because the
193+
# document has a meta tag overriding it.
194+
b"Content-Type": [b'text/html; charset="utf8"'],
195+
},
196+
"https://example.com",
197+
200,
198+
),
199+
)
200+
)
201+
202+
self.pump()
203+
self.assertEqual(channel.code, 200)
204+
self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")
205+
206+
def test_non_ascii_preview_content_type(self):
207+
208+
request, channel = self.make_request(
209+
"GET", "url_preview?url=matrix.org", shorthand=False
210+
)
211+
request.render(self.preview_url)
212+
self.pump()
213+
214+
# We've made one fetch
215+
self.assertEqual(len(self.fetches), 1)
216+
217+
end_content = (
218+
b'<html><head>'
219+
b'<meta property="og:title" content="\xe4\xea\xe0" />'
220+
b'<meta property="og:description" content="hi" />'
221+
b'</head></html>'
222+
)
223+
224+
self.fetches[0][0].callback(
225+
(
226+
end_content,
227+
(
228+
len(end_content),
229+
{
230+
b"Content-Length": [b"%d" % (len(end_content))],
231+
b"Content-Type": [b'text/html; charset="windows-1251"'],
232+
},
233+
"https://example.com",
234+
200,
235+
),
236+
)
237+
)
238+
239+
self.pump()
240+
self.assertEqual(channel.code, 200)
241+
self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")

0 commit comments

Comments
 (0)