From 1e3410ac0938729ac250388ce3945fec382dfe4a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 18 Feb 2017 18:26:00 -0500 Subject: [PATCH] Fix a bug in UTF-8 decoding. It was possible for an invalid continuation byte to sneak through, which resulted in incorrect UTF-8 decoding results. Fixes #321 --- src/utf8.rs | 20 ++++++++++++++++++++ tests/macros.rs | 3 +-- tests/test_default_bytes.rs | 15 +++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/src/utf8.rs b/src/utf8.rs index cd5641ace9..8f6ec1a022 100644 --- a/src/utf8.rs +++ b/src/utf8.rs @@ -92,6 +92,9 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { return None; } let b1 = src[1]; + if 0b11_000000 & b1 != TAG_CONT { + return None; + } let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32); match cp { @@ -104,6 +107,12 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { return None; } let (b1, b2) = (src[1], src[2]); + if 0b11_000000 & b1 != TAG_CONT { + return None; + } + if 0b11_000000 & b2 != TAG_CONT { + return None; + } let cp = ((b0 & !TAG_THREE) as u32) << 12 | ((b1 & !TAG_CONT) as u32) << 6 | ((b2 & !TAG_CONT) as u32); @@ -118,6 +127,15 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { return None; } let (b1, b2, b3) = (src[1], src[2], src[3]); + if 0b11_000000 & b1 != TAG_CONT { + return None; + } + if 0b11_000000 & b2 != TAG_CONT { + return None; + } + if 0b11_000000 & b3 != TAG_CONT { + return None; + } let cp = ((b0 & !TAG_FOUR) as u32) << 18 | ((b1 & !TAG_CONT) as u32) << 12 | ((b2 & !TAG_CONT) as u32) << 6 @@ -236,6 +254,8 @@ mod tests { assert_eq!(decode_utf8(&[0xFF]), None); // Surrogate pair assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None); + // Invalid continuation byte. + assert_eq!(decode_utf8(&[0xD4, 0xC2]), None); // Bad lengths assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes diff --git a/tests/macros.rs b/tests/macros.rs index ba9cd9b4a5..2cd1463b4e 100644 --- a/tests/macros.rs +++ b/tests/macros.rs @@ -13,9 +13,8 @@ macro_rules! ismatch { ($name:ident, $re:expr, $text:expr, $ismatch:expr) => { #[test] fn $name() { - let text = text!($text); let re = regex!($re); - assert!($ismatch == re.is_match(text)); + assert!($ismatch == re.is_match(text!($text))); } }; } diff --git a/tests/test_default_bytes.rs b/tests/test_default_bytes.rs index f9378cffa5..b049b3d2aa 100644 --- a/tests/test_default_bytes.rs +++ b/tests/test_default_bytes.rs @@ -41,6 +41,21 @@ macro_rules! regex_set { include!("macros_bytes.rs"); include!("macros.rs"); +// A silly wrapper to make it possible to write and match raw bytes. +struct R<'a>(&'a [u8]); +impl<'a> R<'a> { fn as_bytes(&self) -> &'a [u8] { &self.0 } } + +// See: https://github.com/rust-lang/regex/issues/321 +// +// These tests are here because they do not have the same behavior in every +// regex engine. +mat!(invalid_utf8_nfa1, r".", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), Some((2, 3))); +mat!(invalid_utf8_nfa2, r"${2}ä", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), None); +mat!(invalid_utf8_nfa3, r".", R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"), + Some((1, 3))); +mat!(invalid_utf8_nfa4, r"${2}ä", R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"), + None); + mod api; mod bytes; mod crazy;