Fix method parsing to reject a leading space (#190)

The `is_token` function, used exclusively for parsing the method in a request line, allows more values than it should. In particular, it allows a leading space to be parsed. This problem is not exposed in hyper, which revalidates any method extracted by httparse, otherwise I'm sure this would have been noticed sooner! Checking for a single range of valid bytes is very fast, so I've taken care to make sure that making `is_token` more complicated doesn't slow down the most common case. While exploring a variety of options, I found the existing benchmark scheme to be a bit misleading because it would test only a single method at a time, so I've made a new benchmark that roughly simulates a mix of requests. Ultimately, what I found to be a reasonable fix without any slowdown for the 99.9999% case is to check `b'A'..=b'Z'` and then fall back to a "byte map". Both methods and header names have the same set of allowed bytes, a "token", but their uses are slightly different. I thought it would make sense to rename `is_token` to `is_method_token`, to mimic `is_header_name_token`.
seanmonstar · Oct 29, 2024 · 9f6702b · 9f6702b
1 parent 97c7e6e
commit 9f6702b
Show file tree

Hide file tree

Showing 4 changed files with 71 additions and 14 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -18,6 +18,7 @@ std = []
 
 [dev-dependencies]
 criterion = "0.3.5"
+rand = "0.8.5"
 
 [lib]
 bench = false

diff --git a/benches/parse.rs b/benches/parse.rs
@@ -181,7 +181,7 @@ fn version(c: &mut Criterion) {
 }
 
 fn method(c: &mut Criterion) {
-    fn _method(c: &mut Criterion, name: &str, input: &'static [u8]) {
+    fn _method(c: &mut Criterion, name: &str, input: &[u8]) {
         c.benchmark_group("method")
         .throughput(Throughput::Bytes(input.len() as u64))
         .bench_function(name, |b| b.iter(|| {
@@ -193,10 +193,42 @@ fn method(c: &mut Criterion) {
     // Common methods should be fast-pathed
     const COMMON_METHODS: &[&str] = &["GET", "HEAD", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE", "PATCH"];
     for method in COMMON_METHODS {
-        _method(c, &method.to_lowercase(), format!("{} / HTTP/1.1\r\n", method).into_bytes().leak());
+        _method(c, &method.to_lowercase(), format!("{} / HTTP/1.1\r\n", method).as_bytes());
     }
     // Custom methods should be infrequent and thus not worth optimizing
     _method(c, "custom", b"CUSTOM / HTTP/1.1\r\n");
+    _method(c, "w3!rd", b"w3!rd / HTTP/1.1\r\n");
+}
+
+fn many_requests(c: &mut Criterion) {
+    use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};
+    let mut requests = [
+        ("GET", 500),
+        ("POST", 300),
+        ("OPTIONS", 100),
+        ("HEAD", 50),
+        ("w3!r`d", 20),
+    ]
+    .iter()
+    .flat_map(|&(method, count)| std::iter::repeat(method).take(count))
+    .map(|method| format!("{method} / HTTP/1.1\r\n\r\n"))
+    .collect::<Vec<_>>();
+    SliceRandom::shuffle(&mut *requests, &mut StdRng::seed_from_u64(0));
+
+    let total_bytes: usize = requests.iter().map(String::len).sum();
+
+    c.benchmark_group("many_requests")
+        .throughput(Throughput::Bytes(total_bytes as u64))
+        .measurement_time(Duration::from_secs(1))
+        .sample_size(1000)
+        .bench_function("_", |b| {
+            b.iter(|| {
+                requests.iter().for_each(|req| {
+                    let mut b = httparse::_benchable::Bytes::new(black_box(req.as_bytes()));
+                    httparse::_benchable::parse_method(&mut b).unwrap();
+                });
+            })
+        });
 }
 
 const WARMUP: Duration = Duration::from_millis(100);
@@ -205,6 +237,6 @@ const SAMPLES: usize = 200;
 criterion_group!{
     name = benches;
     config = Criterion::default().sample_size(SAMPLES).warm_up_time(WARMUP).measurement_time(MTIME);
-    targets = req, req_short, resp, resp_short, uri, header, version, method
+    targets = req, req_short, resp, resp_short, uri, header, version, method, many_requests
 }
 criterion_main!(benches);
diff --git a/src/lib.rs b/src/lib.rs
@@ -44,7 +44,7 @@ pub mod _benchable {
     pub use super::iter::Bytes;
 }
 
-/// Determines if byte is a token char.
+/// Determines if byte is a method token char.
 ///
 /// > ```notrust
 /// > token          = 1*tchar
@@ -55,8 +55,12 @@ pub mod _benchable {
 /// >                ; any VCHAR, except delimiters
 /// > ```
 #[inline]
-fn is_token(b: u8) -> bool {
-    b > 0x1F && b < 0x7F
+fn is_method_token(b: u8) -> bool {
+    match b {
+        // For the majority case, this can be faster than the table lookup.
+        b'A'..=b'Z' => true,
+        _ => TOKEN_MAP[b as usize],
+    }
 }
 
 // ASCII codes to accept URI string.
@@ -95,7 +99,7 @@ pub(crate) fn is_uri_token(b: u8) -> bool {
     URI_MAP[b as usize]
 }
 
-static HEADER_NAME_MAP: [bool; 256] = byte_map![
+static TOKEN_MAP: [bool; 256] = byte_map![
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
@@ -116,7 +120,7 @@ static HEADER_NAME_MAP: [bool; 256] = byte_map![
 
 #[inline]
 pub(crate) fn is_header_name_token(b: u8) -> bool {
-    HEADER_NAME_MAP[b as usize]
+    TOKEN_MAP[b as usize]
 }
 
 static HEADER_VALUE_MAP: [bool; 256] = byte_map![
@@ -930,7 +934,7 @@ fn parse_reason<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
 #[inline]
 fn parse_token<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
     let b = next!(bytes);
-    if !is_token(b) {
+    if !is_method_token(b) {
         // First char must be a token char, it can't be a space which would indicate an empty token.
         return Err(Error::Token);
     }
@@ -939,10 +943,10 @@ fn parse_token<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
         let b = next!(bytes);
         if b == b' ' {
             return Ok(Status::Complete(
-                // SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8.
+                // SAFETY: all bytes up till `i` must have been `is_method_token` and therefore also utf-8.
                 unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) },
             ));
-        } else if !is_token(b) {
+        } else if !is_method_token(b) {
             return Err(Error::Token);
         }
     }
@@ -964,7 +968,7 @@ pub fn parse_uri<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
         }
 
         return Ok(Status::Complete(
-            // SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8.
+            // SAFETY: all bytes up till `i` must have been `is_method_token` and therefore also utf-8.
             unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) },
         ));
     } else {
@@ -1383,7 +1387,7 @@ pub fn parse_chunk_size(buf: &[u8])
 
 #[cfg(test)]
 mod tests {
-    use super::{Request, Response, Status, EMPTY_HEADER, parse_chunk_size};
+    use super::{Error, Request, Response, Status, EMPTY_HEADER, parse_chunk_size};
 
     const NUM_OF_HEADERS: usize = 4;
 
@@ -2676,4 +2680,24 @@ mod tests {
         assert_eq!(response.headers[0].name, "foo");
         assert_eq!(response.headers[0].value, &b"bar"[..]);
     }
+
+    #[test]
+    fn test_request_with_leading_space() {
+        let mut headers = [EMPTY_HEADER; 1];
+        let mut request = Request::new(&mut headers[..]);
+        let result = crate::ParserConfig::default()
+            .parse_request(&mut request, b" GET / HTTP/1.1\r\nfoo:bar\r\n\r\n");
+
+        assert_eq!(result, Err(Error::Token));
+    }
+
+    #[test]
+    fn test_request_with_invalid_method() {
+        let mut headers = [EMPTY_HEADER; 1];
+        let mut request = Request::new(&mut headers[..]);
+        let result = crate::ParserConfig::default()
+            .parse_request(&mut request, b"P()ST / HTTP/1.1\r\nfoo:bar\r\n\r\n");
+
+        assert_eq!(result, Err(Error::Token));
+    }
 }
diff --git a/src/simd/neon.rs b/src/simd/neon.rs
@@ -235,7 +235,7 @@ fn neon_code_matches_header_name_chars_table() {
     unsafe {
         assert!(byte_is_allowed(b'_', match_header_name_vectored));
 
-        for (b, allowed) in crate::HEADER_NAME_MAP.iter().cloned().enumerate() {
+        for (b, allowed) in crate::TOKEN_MAP.iter().cloned().enumerate() {
             assert_eq!(
                 byte_is_allowed(b as u8, match_header_name_vectored),
                 allowed,