From 832185fa9c02afe7eb48ff7f69e84f6c520aa1b9 Mon Sep 17 00:00:00 2001 From: Jacob Quinn Date: Wed, 25 May 2022 15:21:14 -0600 Subject: [PATCH] Attempt to reencode malformed headers from Latin-1 to UTF8 (#830) * Attempt to reencode malformed headers from Latin-1 to UTF8 As brought up in #796, there may be scenarios where headers contain non-UTF-8 characters (even though they're supposed to be ASCII). Appreciation to @StefanKarpinski for the Latin-1 -> UTF-8 conversion code and the suggestion to try reencoding before throwing an error. As proposed in this PR, the normal header parsing path should be unaffected; only when the header bytes are not valid UTF-8 will we attempt this reencoding. Note that curl warns on the malformed header and filters it out. * cleanup implementation and ensure tests pass * fix * one more cleanup --- src/Parsers.jl | 23 +++++++++++++++++++++++ test/parser.jl | 10 ++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/Parsers.jl b/src/Parsers.jl index cb701662e..73527089b 100644 --- a/src/Parsers.jl +++ b/src/Parsers.jl @@ -215,6 +215,28 @@ Return `Pair(field-name => field-value)` and a `SubString` containing the remaining header-field lines. """ function parse_header_field(bytes::SubString{String})::Tuple{Header,SubString{String}} + # https://github.com/JuliaWeb/HTTP.jl/issues/796 + # there may be certain scenarios where non-ASCII characters are + # included (illegally) in the headers; curl warns on these + # "malformed headers" and ignores them. We attempt to re-encode + # these from Latin-1 => UTF-8 and then try to parse. 
+ if !isvalid(bytes) + @warn "malformed HTTP header detected; attempting to re-encode from Latin-1 to UTF8" + rawbytes = codeunits(bytes) + buf = Base.StringVector(length(rawbytes) + count(≥(0x80), rawbytes)) + i = 0 + for byte in rawbytes + if byte ≥ 0x80 + buf[i += 1] = 0xc0 | (byte >> 6) + buf[i += 1] = 0x80 | (byte & 0x3f) + else + buf[i += 1] = byte + end + end + bytes = SubString(String(buf)) + !isvalid(bytes) && @goto error + end + # First look for: field-name ":" field-value re = access_threaded(header_field_regex_f, header_field_regex) if exec(re, bytes) @@ -235,6 +257,7 @@ function parse_header_field(bytes::SubString{String})::Tuple{Header,SubString{St return (group(1, re, bytes) => unfold), nextbytes(re, bytes) end +@label error throw(ParseError(:INVALID_HEADER_FIELD, bytes)) end diff --git a/test/parser.jl b/test/parser.jl index db57e5a80..6b528d663 100644 --- a/test/parser.jl +++ b/test/parser.jl @@ -376,6 +376,16 @@ end @test length(r.headers) == 1 end end + + # https://github.com/JuliaWeb/HTTP.jl/issues/796 + @testset "Latin-1 values in header" begin + reqstr = "GET / HTTP/1.1\r\n" * "link: ; rel=\"canonical\", ; version=\"vor\"; type=\"text/xml\"; rel=\"item\", ; version=\"vor\"; type=\"text/plain\"; rel=\"item\", ; version=\"tdm\"; rel=\"license\", ; title=\"Santiago Badia\"; rel=\"author\", ; title=\"Alberto F. Mart\xedn\"; rel=\"author\"\r\n\r\n" + r = parse(HTTP.Messages.Request, reqstr) + @test r.method == "GET" + @test r.target == "/" + @test length(r.headers) == 1 + @test r.headers[1][2] == "; rel=\"canonical\", ; version=\"vor\"; type=\"text/xml\"; rel=\"item\", ; version=\"vor\"; type=\"text/plain\"; rel=\"item\", ; version=\"tdm\"; rel=\"license\", ; title=\"Santiago Badia\"; rel=\"author\", ; title=\"Alberto F. Martín\"; rel=\"author\"" + end end @testset "Responses" begin