Skip to content

Commit

Permalink
Attempt to reencode malformed headers from Latin-1 to UTF8 (#830)
Browse files Browse the repository at this point in the history
* Attempt to reencode malformed headers from Latin-1 to UTF8

As brought up in #796, there are scenarios where headers contain
non-UTF8 characters (even though they're supposed to be ASCII). Appreciation
to @StefanKarpinski for the Latin-1 -> UTF-8 conversion code and the suggestion
to try reencoding before throwing an error. As proposed in this PR, the normal
header parsing path should be unaffected and only when we're unable to parse
a normal header will we attempt this reencoding.

Note that curl warns on the malformed header and filters it out.

* cleanup implementation and ensure tests pass

* fix

* one more cleanup
  • Loading branch information
quinnj authored May 25, 2022
1 parent 54a6c13 commit 832185f
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 0 deletions.
23 changes: 23 additions & 0 deletions src/Parsers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,28 @@ Return `Pair(field-name => field-value)` and
a `SubString` containing the remaining header-field lines.
"""
function parse_header_field(bytes::SubString{String})::Tuple{Header,SubString{String}}
# https://github.com/JuliaWeb/HTTP.jl/issues/796
# there may be certain scenarios where non-ascii characters are
# included (illegally) in the headers; curl warns on these
# "malformed headers" and ignores them. we attempt to re-encode
# these from latin-1 => utf-8 and then try to parse.
if !isvalid(bytes)
@warn "malformed HTTP header detected; attempting to re-encode from Latin-1 to UTF8"
rawbytes = codeunits(bytes)
buf = Base.StringVector(length(rawbytes) + count(≥(0x80), rawbytes))
i = 0
for byte in rawbytes
if byte ≥ 0x80
buf[i += 1] = 0xc0 | (byte >> 6)
buf[i += 1] = 0x80 | (byte & 0x3f)
else
buf[i += 1] = byte
end
end
bytes = SubString(String(buf))
!isvalid(bytes) && @goto error
end

# First look for: field-name ":" field-value
re = access_threaded(header_field_regex_f, header_field_regex)
if exec(re, bytes)
Expand All @@ -235,6 +257,7 @@ function parse_header_field(bytes::SubString{String})::Tuple{Header,SubString{St
return (group(1, re, bytes) => unfold), nextbytes(re, bytes)
end

@label error
throw(ParseError(:INVALID_HEADER_FIELD, bytes))
end

Expand Down
10 changes: 10 additions & 0 deletions test/parser.jl
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,16 @@ end
@test length(r.headers) == 1
end
end

# https://github.com/JuliaWeb/HTTP.jl/issues/796
@testset "Latin-1 values in header" begin
    # The `link` header value below carries a raw 0xED byte (`\xed`), so the
    # request text is not valid UTF-8. The test expects the parsed header value
    # to come back with that byte as the character `í`.
    request_line = "GET / HTTP/1.1\r\n"
    link_header = "link: <http://dx.doi.org/10.1016/j.cma.2021.114093>; rel=\"canonical\", <https://api.elsevier.com/content/article/PII:S0045782521004242?httpAccept=text/xml>; version=\"vor\"; type=\"text/xml\"; rel=\"item\", <https://api.elsevier.com/content/article/PII:S0045782521004242?httpAccept=text/plain>; version=\"vor\"; type=\"text/plain\"; rel=\"item\", <https://www.elsevier.com/tdm/userlicense/1.0/>; version=\"tdm\"; rel=\"license\", <http://orcid.org/0000-0003-2391-4086>; title=\"Santiago Badia\"; rel=\"author\", <http://orcid.org/0000-0001-5751-4561>; title=\"Alberto F. Mart\xedn\"; rel=\"author\"\r\n\r\n"
    expected_value = "<http://dx.doi.org/10.1016/j.cma.2021.114093>; rel=\"canonical\", <https://api.elsevier.com/content/article/PII:S0045782521004242?httpAccept=text/xml>; version=\"vor\"; type=\"text/xml\"; rel=\"item\", <https://api.elsevier.com/content/article/PII:S0045782521004242?httpAccept=text/plain>; version=\"vor\"; type=\"text/plain\"; rel=\"item\", <https://www.elsevier.com/tdm/userlicense/1.0/>; version=\"tdm\"; rel=\"license\", <http://orcid.org/0000-0003-2391-4086>; title=\"Santiago Badia\"; rel=\"author\", <http://orcid.org/0000-0001-5751-4561>; title=\"Alberto F. Martín\"; rel=\"author\""

    request = parse(HTTP.Messages.Request, string(request_line, link_header))

    @test request.method == "GET"
    @test request.target == "/"
    @test length(request.headers) == 1
    # Each header is a name => value pair; index 2 selects the value.
    @test request.headers[1][2] == expected_value
end
end

@testset "Responses" begin
Expand Down

0 comments on commit 832185f

Please sign in to comment.