Use <meta> tags to discover the per-page encoding of html previews (#4183)

hawkowl · web-flow · commit df758e155dac · 2018-11-15T11:05:08.000-06:00
diff --git a/changelog.d/4183.bugfix b/changelog.d/4183.bugfix
@@ -0,0 +1 @@
+URL previews now correctly decode non-UTF-8 text if the header contains a `<meta http-equiv="Content-Type"` header.
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
@@ -53,6 +53,9 @@
 
 logger = logging.getLogger(__name__)
 
+_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
+_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
+
 
 class PreviewUrlResource(Resource):
     isLeaf = True
@@ -223,15 +226,25 @@ def _do_preview(self, url, user, ts):
             with open(media_info['filename'], 'rb') as file:
                 body = file.read()
 
-            # clobber the encoding from the content-type, or default to utf-8
-            # XXX: this overrides any <meta/> or XML charset headers in the body
-            # which may pose problems, but so far seems to work okay.
-            match = re.match(
-                r'.*; *charset="?(.*?)"?(;|$)',
-                media_info['media_type'],
-                re.I
-            )
-            encoding = match.group(1) if match else "utf-8"
+            encoding = None
+
+            # Let's try and figure out if it has an encoding set in a meta tag.
+            # Limit it to the first 1kb, since it ought to be in the meta tags
+            # at the top.
+            match = _charset_match.search(body[:1000])
+
+            # If we find a match, it should take precedence over the
+            # Content-Type header, so set it here.
+            if match:
+                encoding = match.group(1).decode('ascii')
+
+            # If we don't find a match, we'll look at the HTTP Content-Type, and
+            # if that doesn't exist, we'll fall back to UTF-8.
+            if not encoding:
+                match = _content_type_match.match(
+                    media_info['media_type']
+                )
+                encoding = match.group(1) if match else "utf-8"
 
             og = decode_and_calc_og(body, media_info['uri'], encoding)
 
diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py
@@ -162,3 +162,80 @@ def test_cache_returns_correct_type(self):
         self.assertEqual(
             channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
         )
+
+    def test_non_ascii_preview_httpequiv(self):
+
+        request, channel = self.make_request(
+            "GET", "url_preview?url=matrix.org", shorthand=False
+        )
+        request.render(self.preview_url)
+        self.pump()
+
+        # We've made one fetch
+        self.assertEqual(len(self.fetches), 1)
+
+        end_content = (
+            b'<html><head>'
+            b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
+            b'<meta property="og:title" content="\xe4\xea\xe0" />'
+            b'<meta property="og:description" content="hi" />'
+            b'</head></html>'
+        )
+
+        self.fetches[0][0].callback(
+            (
+                end_content,
+                (
+                    len(end_content),
+                    {
+                        b"Content-Length": [b"%d" % (len(end_content))],
+                        # This charset=utf-8 should be ignored, because the
+                        # document has a meta tag overriding it.
+                        b"Content-Type": [b'text/html; charset="utf8"'],
+                    },
+                    "https://example.com",
+                    200,
+                ),
+            )
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")
+
+    def test_non_ascii_preview_content_type(self):
+
+        request, channel = self.make_request(
+            "GET", "url_preview?url=matrix.org", shorthand=False
+        )
+        request.render(self.preview_url)
+        self.pump()
+
+        # We've made one fetch
+        self.assertEqual(len(self.fetches), 1)
+
+        end_content = (
+            b'<html><head>'
+            b'<meta property="og:title" content="\xe4\xea\xe0" />'
+            b'<meta property="og:description" content="hi" />'
+            b'</head></html>'
+        )
+
+        self.fetches[0][0].callback(
+            (
+                end_content,
+                (
+                    len(end_content),
+                    {
+                        b"Content-Length": [b"%d" % (len(end_content))],
+                        b"Content-Type": [b'text/html; charset="windows-1251"'],
+                    },
+                    "https://example.com",
+                    200,
+                ),
+            )
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+URL previews now correctly decode non-UTF-8 text if the header contains a `<meta http-equiv="Content-Type"` header.