From d833855793d3dec66ee179bafb53c1d5edfbaa05 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 1 Apr 2017 03:43:03 +0100 Subject: [PATCH 1/5] always treat repo filenames as utf-8 fixing https://github.com/vector-im/riot-web/issues/3155 --- synapse/rest/media/v1/_base.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index b9600f216709..21e3d86a228d 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -71,20 +71,17 @@ def respond_with_file(request, media_type, file_path, if os.path.isfile(file_path): request.setHeader(b"Content-Type", media_type.encode("UTF-8")) if upload_name: - if is_ascii(upload_name): - request.setHeader( - b"Content-Disposition", - b"inline; filename=%s" % ( - urllib.quote(upload_name.encode("utf-8")), - ), - ) - else: - request.setHeader( - b"Content-Disposition", - b"inline; filename*=utf-8''%s" % ( - urllib.quote(upload_name.encode("utf-8")), - ), - ) + # technically we should only use the crazy *=utf-8'' syntax + # if the filename actually contains utf-8. but if we don't, some + # browsers (Chrome at some point before 57, and Firefox 52) get it + # get it wrong and return %20 verbatim rather than url-decoding it + # causing bug https://github.com/vector-im/riot-web/issues/3155 + request.setHeader( + b"Content-Disposition", + b"inline; filename*=utf-8''%s" % ( + urllib.quote(upload_name.encode("utf-8")), + ), + ) # cache for at least a day. # XXX: we might want to turn this off for data we don't want to From 70c88614d178d4a06bf9a4fcdad5f81ebe8c3540 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sat, 1 Apr 2017 04:08:35 +0100 Subject: [PATCH 2/5] better commenting. btw, it looks like this never particularly deliberately special-cased non-utf8; when Mjark added utf8 support in https://github.com/matrix-org/synapse/pull/259 he just left the original code path alone, assuming it worked. --- synapse/rest/media/v1/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index 21e3d86a228d..54a931fad041 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -73,7 +73,7 @@ def respond_with_file(request, media_type, file_path, if upload_name: # technically we should only use the crazy *=utf-8'' syntax # if the filename actually contains utf-8. but if we don't, some - # browsers (Chrome at some point before 57, and Firefox 52) get it + # browsers (Firefox 52, Safari 10, Chrome at some point < 57) get it # get it wrong and return %20 verbatim rather than url-decoding it # causing bug https://github.com/vector-im/riot-web/issues/3155 request.setHeader( From 918920acede49f05fd2b32fd28d1c64255d6d51d Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Wed, 27 Feb 2019 23:08:47 +0000 Subject: [PATCH 3/5] Update to use the ascii filename where possible --- synapse/rest/media/v1/_base.py | 58 ++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index d247145312bc..45fcfce06730 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -99,12 +99,29 @@ def _quote(x): request.setHeader(b"Content-Type", media_type.encode("UTF-8")) if upload_name: - # technically we should only use the crazy *=utf-8'' syntax - # if the filename actually contains utf-8. but if we don't, some - # browsers (Firefox 52, Safari 10, Chrome at some point < 57) get it - # get it wrong and return %20 verbatim rather than url-decoding it - # causing bug https://github.com/vector-im/riot-web/issues/3155 - disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name), ) + # RFC6266 section 4.1 [1] defines both `filename` and `filename*`. + # + # `filename` is defined to be a `value`, which is defined by RFC2616 + # section 3.6 [2] to be a `token` or a `quoted-string`, where a `token` + # is (essentially) a single US-ASCII word, and a `quoted-string` is a + # US-ASCII string surrounded by double-quotes, using backslash as an + # escape charater. Note that %-encoding is *not* permitted. + # + # `filename*` is defined to be an `ext-value`, which is defined in + # RFC5987 section 3.2.1 [3] to be `charset "'" [ language ] "'" value-chars`, + # where `value-chars` is essentially a %-encoded string in the given charset. + # + # [1]: https://tools.ietf.org/html/rfc6266#section-4.1 + # [2]: https://tools.ietf.org/html/rfc2616#section-3.6 + # [3]: https://tools.ietf.org/html/rfc5987#section-3.2.1 + + # We avoid the quoted-string version of filename, because (a) synapse didn't + # correctly interpret those as of 0.99.2 and (b) they are a bit of a pain and we + # may as well just do the filename* version. + if _can_encode_filename_as_token(upload_name): + disposition = 'inline; filename=%s' % (upload_name, ) + else: + disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name), ) request.setHeader(b"Content-Disposition", disposition.encode('ascii')) @@ -117,6 +134,35 @@ def _quote(x): request.setHeader(b"Content-Length", b"%d" % (file_size,)) +# separators as defined in RFC2616. SP and HT are handled separately. +# see _can_encode_filename_as_token. +_FILENAME_SEPARATOR_CHARS = set(( + "(", ")", "<", ">", "@", ",", ";", ":", "\\", '"', + "/", "[", "]", "?", "=", "{", "}", +)) + + +def _can_encode_filename_as_token(x): + for c in x: + # from RFC2616: + # + # token = 1* + # + # separators = "(" | ")" | "<" | ">" | "@" + # | "," | ";" | ":" | "\" | <"> + # | "/" | "[" | "]" | "?" | "=" + # | "{" | "}" | SP | HT + # + # CHAR = + # + # CTL = + # + if ord(c) >= 127 or ord(c) <= 32 or c in _FILENAME_SEPARATOR_CHARS: + return False + return True + + @defer.inlineCallbacks def respond_with_responder(request, responder, media_type, file_size, upload_name=None): """Responds to the request with given responder. If responder is None then From c758d36f045858fd6e73ad96ea21c321ef3193e9 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Wed, 27 Feb 2019 23:10:26 +0000 Subject: [PATCH 4/5] changelog --- changelog.d/2090.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/2090.bugfix diff --git a/changelog.d/2090.bugfix b/changelog.d/2090.bugfix new file mode 100644 index 000000000000..de2d22fcb8b1 --- /dev/null +++ b/changelog.d/2090.bugfix @@ -0,0 +1 @@ +Fix a bug where media with spaces in the name would get a corrupted name. From c40303366e05c86ccfdc805aed62c59767638c96 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> Date: Thu, 28 Feb 2019 09:26:29 +0000 Subject: [PATCH 5/5] clarify comment --- synapse/rest/media/v1/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index e93ae88eb1af..953d89bd8203 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -116,7 +116,7 @@ def _quote(x): # [2]: https://tools.ietf.org/html/rfc2616#section-3.6 # [3]: https://tools.ietf.org/html/rfc5987#section-3.2.1 - # We avoid the quoted-string version of filename, because (a) synapse didn't + # We avoid the quoted-string version of `filename`, because (a) synapse didn't # correctly interpret those as of 0.99.2 and (b) they are a bit of a pain and we # may as well just do the filename* version. if _can_encode_filename_as_token(upload_name):