Skip to content

Commit

Permalink
Fix method parsing to reject a leading space (#190)
Browse files Browse the repository at this point in the history
The `is_token` function, used exclusively for parsing the method in
a request line, allows more values than it should. In particular, it
allows a leading space to be parsed.  This problem is not exposed in
hyper, which revalidates any method extracted by httparse, otherwise
I'm sure this would have been noticed sooner!

Checking for a single range of valid bytes is very fast, so I've taken
care to make sure that making `is_token` more complicated doesn't
slow down the most common case.  While exploring a variety of options,
I found the existing benchmark scheme to be a bit misleading because
it would test only a single method at a time, so I've made a new
benchmark that roughly simulates a mix of requests.  Ultimately, what
I found to be a reasonable fix without any slowdown for the 99.9999%
case is to check `b'A'..=b'Z'` and then fall back to a "byte map".

Both methods and header names have the same set of allowed bytes, a
"token", but their uses are slightly different. I thought it would
make sense to rename `is_token` to `is_method_token`, to mimic
`is_header_name_token`.
  • Loading branch information
jeddenlea authored Oct 29, 2024
1 parent 97c7e6e commit 9f6702b
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 14 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ std = []

[dev-dependencies]
criterion = "0.3.5"
rand = "0.8.5"

[lib]
bench = false
Expand Down
38 changes: 35 additions & 3 deletions benches/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ fn version(c: &mut Criterion) {
}

fn method(c: &mut Criterion) {
fn _method(c: &mut Criterion, name: &str, input: &'static [u8]) {
fn _method(c: &mut Criterion, name: &str, input: &[u8]) {
c.benchmark_group("method")
.throughput(Throughput::Bytes(input.len() as u64))
.bench_function(name, |b| b.iter(|| {
Expand All @@ -193,10 +193,42 @@ fn method(c: &mut Criterion) {
// Common methods should be fast-pathed
const COMMON_METHODS: &[&str] = &["GET", "HEAD", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE", "PATCH"];
for method in COMMON_METHODS {
_method(c, &method.to_lowercase(), format!("{} / HTTP/1.1\r\n", method).into_bytes().leak());
_method(c, &method.to_lowercase(), format!("{} / HTTP/1.1\r\n", method).as_bytes());
}
// Custom methods should be infrequent and thus not worth optimizing
_method(c, "custom", b"CUSTOM / HTTP/1.1\r\n");
_method(c, "w3!rd", b"w3!rd / HTTP/1.1\r\n");
}

fn many_requests(c: &mut Criterion) {
use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};
let mut requests = [
("GET", 500),
("POST", 300),
("OPTIONS", 100),
("HEAD", 50),
("w3!r`d", 20),
]
.iter()
.flat_map(|&(method, count)| std::iter::repeat(method).take(count))
.map(|method| format!("{method} / HTTP/1.1\r\n\r\n"))
.collect::<Vec<_>>();
SliceRandom::shuffle(&mut *requests, &mut StdRng::seed_from_u64(0));

let total_bytes: usize = requests.iter().map(String::len).sum();

c.benchmark_group("many_requests")
.throughput(Throughput::Bytes(total_bytes as u64))
.measurement_time(Duration::from_secs(1))
.sample_size(1000)
.bench_function("_", |b| {
b.iter(|| {
requests.iter().for_each(|req| {
let mut b = httparse::_benchable::Bytes::new(black_box(req.as_bytes()));
httparse::_benchable::parse_method(&mut b).unwrap();
});
})
});
}

const WARMUP: Duration = Duration::from_millis(100);
Expand All @@ -205,6 +237,6 @@ const SAMPLES: usize = 200;
criterion_group!{
name = benches;
config = Criterion::default().sample_size(SAMPLES).warm_up_time(WARMUP).measurement_time(MTIME);
targets = req, req_short, resp, resp_short, uri, header, version, method
targets = req, req_short, resp, resp_short, uri, header, version, method, many_requests
}
criterion_main!(benches);
44 changes: 34 additions & 10 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ pub mod _benchable {
pub use super::iter::Bytes;
}

/// Determines if byte is a token char.
/// Determines if byte is a method token char.
///
/// > ```notrust
/// > token = 1*tchar
Expand All @@ -55,8 +55,12 @@ pub mod _benchable {
/// > ; any VCHAR, except delimiters
/// > ```
#[inline]
fn is_token(b: u8) -> bool {
b > 0x1F && b < 0x7F
fn is_method_token(b: u8) -> bool {
match b {
// For the majority case, this can be faster than the table lookup.
b'A'..=b'Z' => true,
_ => TOKEN_MAP[b as usize],
}
}

// ASCII codes to accept URI string.
Expand Down Expand Up @@ -95,7 +99,7 @@ pub(crate) fn is_uri_token(b: u8) -> bool {
URI_MAP[b as usize]
}

static HEADER_NAME_MAP: [bool; 256] = byte_map![
static TOKEN_MAP: [bool; 256] = byte_map![
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
Expand All @@ -116,7 +120,7 @@ static HEADER_NAME_MAP: [bool; 256] = byte_map![

#[inline]
pub(crate) fn is_header_name_token(b: u8) -> bool {
HEADER_NAME_MAP[b as usize]
TOKEN_MAP[b as usize]
}

static HEADER_VALUE_MAP: [bool; 256] = byte_map![
Expand Down Expand Up @@ -930,7 +934,7 @@ fn parse_reason<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
#[inline]
fn parse_token<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
let b = next!(bytes);
if !is_token(b) {
if !is_method_token(b) {
// First char must be a token char, it can't be a space which would indicate an empty token.
return Err(Error::Token);
}
Expand All @@ -939,10 +943,10 @@ fn parse_token<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
let b = next!(bytes);
if b == b' ' {
return Ok(Status::Complete(
// SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8.
// SAFETY: all bytes up till `i` must have been `is_method_token` and therefore also utf-8.
unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) },
));
} else if !is_token(b) {
} else if !is_method_token(b) {
return Err(Error::Token);
}
}
Expand All @@ -964,7 +968,7 @@ pub fn parse_uri<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
}

return Ok(Status::Complete(
// SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8.
// SAFETY: all bytes up till `i` must have been `is_method_token` and therefore also utf-8.
unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) },
));
} else {
Expand Down Expand Up @@ -1383,7 +1387,7 @@ pub fn parse_chunk_size(buf: &[u8])

#[cfg(test)]
mod tests {
use super::{Request, Response, Status, EMPTY_HEADER, parse_chunk_size};
use super::{Error, Request, Response, Status, EMPTY_HEADER, parse_chunk_size};

const NUM_OF_HEADERS: usize = 4;

Expand Down Expand Up @@ -2676,4 +2680,24 @@ mod tests {
assert_eq!(response.headers[0].name, "foo");
assert_eq!(response.headers[0].value, &b"bar"[..]);
}

#[test]
fn test_request_with_leading_space() {
let mut headers = [EMPTY_HEADER; 1];
let mut request = Request::new(&mut headers[..]);
let result = crate::ParserConfig::default()
.parse_request(&mut request, b" GET / HTTP/1.1\r\nfoo:bar\r\n\r\n");

assert_eq!(result, Err(Error::Token));
}

#[test]
fn test_request_with_invalid_method() {
let mut headers = [EMPTY_HEADER; 1];
let mut request = Request::new(&mut headers[..]);
let result = crate::ParserConfig::default()
.parse_request(&mut request, b"P()ST / HTTP/1.1\r\nfoo:bar\r\n\r\n");

assert_eq!(result, Err(Error::Token));
}
}
2 changes: 1 addition & 1 deletion src/simd/neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ fn neon_code_matches_header_name_chars_table() {
unsafe {
assert!(byte_is_allowed(b'_', match_header_name_vectored));

for (b, allowed) in crate::HEADER_NAME_MAP.iter().cloned().enumerate() {
for (b, allowed) in crate::TOKEN_MAP.iter().cloned().enumerate() {
assert_eq!(
byte_is_allowed(b as u8, match_header_name_vectored),
allowed,
Expand Down

0 comments on commit 9f6702b

Please sign in to comment.