From 1e3410ac0938729ac250388ce3945fec382dfe4a Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sat, 18 Feb 2017 18:26:00 -0500
Subject: [PATCH] Fix a bug in UTF-8 decoding.

decode_utf8 did not verify that the trailing bytes of a multi-byte
sequence were actually continuation bytes, so an invalid continuation
byte could sneak through and produce an incorrect decoding result.

Fixes #321
---
 src/utf8.rs                 | 20 ++++++++++++++++++++
 tests/macros.rs             |  3 +--
 tests/test_default_bytes.rs | 15 +++++++++++++++
 3 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/src/utf8.rs b/src/utf8.rs
index cd5641ace9..8f6ec1a022 100644
--- a/src/utf8.rs
+++ b/src/utf8.rs
@@ -92,6 +92,9 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
                 return None;
             }
             let b1 = src[1];
+            if 0b11_000000 & b1 != TAG_CONT {
+                return None;
+            }
             let cp = ((b0 & !TAG_TWO) as u32) << 6
                      | ((b1 & !TAG_CONT) as u32);
             match cp {
@@ -104,6 +107,12 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
                 return None;
             }
             let (b1, b2) = (src[1], src[2]);
+            if 0b11_000000 & b1 != TAG_CONT {
+                return None;
+            }
+            if 0b11_000000 & b2 != TAG_CONT {
+                return None;
+            }
             let cp = ((b0 & !TAG_THREE) as u32) << 12
                      | ((b1 & !TAG_CONT) as u32) << 6
                      | ((b2 & !TAG_CONT) as u32);
@@ -118,6 +127,15 @@ pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
                 return None;
             }
             let (b1, b2, b3) = (src[1], src[2], src[3]);
+            if 0b11_000000 & b1 != TAG_CONT {
+                return None;
+            }
+            if 0b11_000000 & b2 != TAG_CONT {
+                return None;
+            }
+            if 0b11_000000 & b3 != TAG_CONT {
+                return None;
+            }
             let cp = ((b0 & !TAG_FOUR) as u32) << 18
                      | ((b1 & !TAG_CONT) as u32) << 12
                      | ((b2 & !TAG_CONT) as u32) << 6
@@ -236,6 +254,8 @@ mod tests {
         assert_eq!(decode_utf8(&[0xFF]), None);
         // Surrogate pair
         assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
+        // Invalid continuation byte.
+        assert_eq!(decode_utf8(&[0xD4, 0xC2]), None);
         // Bad lengths
         assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
         assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
diff --git a/tests/macros.rs b/tests/macros.rs
index ba9cd9b4a5..2cd1463b4e 100644
--- a/tests/macros.rs
+++ b/tests/macros.rs
@@ -13,9 +13,8 @@ macro_rules! ismatch {
     ($name:ident, $re:expr, $text:expr, $ismatch:expr) => {
         #[test]
         fn $name() {
-            let text = text!($text);
             let re = regex!($re);
-            assert!($ismatch == re.is_match(text));
+            assert!($ismatch == re.is_match(text!($text)));
         }
     };
 }
diff --git a/tests/test_default_bytes.rs b/tests/test_default_bytes.rs
index f9378cffa5..b049b3d2aa 100644
--- a/tests/test_default_bytes.rs
+++ b/tests/test_default_bytes.rs
@@ -41,6 +41,21 @@ macro_rules! regex_set {
 include!("macros_bytes.rs");
 include!("macros.rs");
 
+// A silly wrapper to make it possible to write and match raw bytes.
+struct R<'a>(&'a [u8]);
+impl<'a> R<'a> { fn as_bytes(&self) -> &'a [u8] { &self.0 } }
+
+// See: https://github.com/rust-lang/regex/issues/321
+//
+// These tests are here because they do not have the same behavior in every
+// regex engine.
+mat!(invalid_utf8_nfa1, r".", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), Some((2, 3)));
+mat!(invalid_utf8_nfa2, r"${2}ä", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), None);
+mat!(invalid_utf8_nfa3, r".", R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
+     Some((1, 3)));
+mat!(invalid_utf8_nfa4, r"${2}ä", R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"),
+     None);
+
 mod api;
 mod bytes;
 mod crazy;