rust-lang · BurntSushi · Jul 10, 2016 · Jul 9, 2016 · Jul 9, 2016 · Jul 9, 2016
diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs
@@ -1336,6 +1336,9 @@ pub enum ErrorKind {
     /// This never returned if the parser is permitted to allow expressions
     /// that match arbitrary bytes.
     InvalidUtf8,
+    /// A character class was constructed such that it is empty,
+    /// e.g., `[^\d\D]`.
+    EmptyClass,
     /// Hints that destructuring should not be exhaustive.
     ///
     /// This enum may grow additional variants, so this makes sure clients
@@ -1398,6 +1401,7 @@ impl ErrorKind {
             FlagNotAllowed(_) => "flag not allowed",
             UnicodeNotAllowed => "Unicode features not allowed",
             InvalidUtf8 => "matching arbitrary bytes is not allowed",
+            EmptyClass => "empty character class",
             __Nonexhaustive => unreachable!(),
         }
     }
@@ -1507,6 +1511,8 @@ impl fmt::Display for ErrorKind {
                            (u) flag is not set."),
             InvalidUtf8 =>
                 write!(f, "Matching arbitrary bytes is not allowed."),
+            EmptyClass =>
+                write!(f, "Empty character classes are not allowed."),
             __Nonexhaustive => unreachable!(),
         }
     }

diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs
@@ -581,12 +581,18 @@ impl Parser {
                     _ => unreachable!(),
                 },
                 start => {
+                    if !self.flags.unicode {
+                        let _ = try!(self.codepoint_to_one_byte(start));
+                    }
                     self.bump();
                     try!(self.parse_class_range(&mut class, start));
                 }
             }
         }
         class = self.class_transform(negated, class).canonicalize();
+        if class.is_empty() {
+            return Err(self.err(ErrorKind::EmptyClass));
+        }
         Ok(Build::Expr(if self.flags.unicode {
             Expr::Class(class)
         } else {
@@ -639,7 +645,13 @@ impl Parser {
                 // Because `parse_escape` can never return `LeftParen`.
                 _ => unreachable!(),
             },
-            _ => self.bump(),
+            _ => {
+                let c = self.bump();
+                if !self.flags.unicode {
+                    let _ = try!(self.codepoint_to_one_byte(c));
+                }
+                c
+            }
         };
         if end < start {
             // e.g., [z-a]
@@ -1277,7 +1289,7 @@ mod tests {
         ErrorKind,
     };
     use unicode::regex::{PERLD, PERLS, PERLW};
-    use super::{LOWER, UPPER, Flags, Parser, ascii_class};
+    use super::{LOWER, UPPER, WORD, Flags, Parser, ascii_class};
 
     static YI: &'static [(char, char)] = &[
         ('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}'),
@@ -2002,6 +2014,8 @@ mod tests {
 
         assert_eq!(pb(r"(?-u)[a]"), Expr::ClassBytes(bclass(&[(b'a', b'a')])));
         assert_eq!(pb(r"(?-u)[\x00]"), Expr::ClassBytes(bclass(&[(0, 0)])));
+        assert_eq!(pb(r"(?-u)[\xFF]"),
+                   Expr::ClassBytes(bclass(&[(0xFF, 0xFF)])));
         assert_eq!(pb("(?-u)[\n]"),
                    Expr::ClassBytes(bclass(&[(b'\n', b'\n')])));
         assert_eq!(pb(r"(?-u)[\n]"),
@@ -2127,10 +2141,10 @@ mod tests {
 
     #[test]
     fn class_multiple_class_negate_negate() {
-        let nperld = class(PERLD).negate();
+        let nperlw = class(PERLW).negate();
         let nyi = class(YI).negate();
-        let cls = CharClass::empty().merge(nperld).merge(nyi);
-        assert_eq!(p(r"[^\D\P{Yi}]"), Expr::Class(cls.negate()));
+        let cls = CharClass::empty().merge(nperlw).merge(nyi);
+        assert_eq!(p(r"[^\W\P{Yi}]"), Expr::Class(cls.negate()));
     }
 
     #[test]
@@ -2149,10 +2163,10 @@ mod tests {
 
     #[test]
     fn class_multiple_class_negate_negate_casei() {
-        let nperld = class(PERLD).negate();
+        let nperlw = class(PERLW).negate();
         let nyi = class(YI).negate();
-        let class = CharClass::empty().merge(nperld).merge(nyi);
-        assert_eq!(p(r"(?i)[^\D\P{Yi}]"),
+        let class = CharClass::empty().merge(nperlw).merge(nyi);
+        assert_eq!(p(r"(?i)[^\W\P{Yi}]"),
                    Expr::Class(class.case_fold().negate()));
     }
 
@@ -2236,10 +2250,10 @@ mod tests {
 
     #[test]
     fn ascii_classes_negate_multiple() {
-        let (nlower, nupper) = (class(LOWER).negate(), class(UPPER).negate());
-        let cls = CharClass::empty().merge(nlower).merge(nupper);
-        assert_eq!(p("[[:^lower:][:^upper:]]"), Expr::Class(cls.clone()));
-        assert_eq!(p("[^[:^lower:][:^upper:]]"), Expr::Class(cls.negate()));
+        let (nlower, nword) = (class(LOWER).negate(), class(WORD).negate());
+        let cls = CharClass::empty().merge(nlower).merge(nword);
+        assert_eq!(p("[[:^lower:][:^word:]]"), Expr::Class(cls.clone()));
+        assert_eq!(p("[^[:^lower:][:^word:]]"), Expr::Class(cls.negate()));
     }
 
     #[test]
@@ -2402,6 +2416,13 @@ mod tests {
         test_err!(r"☃(?-u:\pL)", 9, ErrorKind::UnicodeNotAllowed, flags);
     }
 
+    #[test]
+    fn unicode_class_literal_not_allowed() {
+        let flags = Flags { allow_bytes: true, .. Flags::default() };
+        test_err!(r"(?-u)[☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
+        test_err!(r"(?-u)[☃-☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
+    }
+
     #[test]
     fn unicode_hex_not_allowed() {
         let flags = Flags { allow_bytes: true, .. Flags::default() };
@@ -2725,6 +2746,7 @@ mod tests {
     fn error_class_empty_range() {
         test_err!("[]", 2, ErrorKind::UnexpectedClassEof);
         test_err!("[^]", 3, ErrorKind::UnexpectedClassEof);
+        test_err!(r"[^\d\D]", 7, ErrorKind::EmptyClass);
     }
 
     #[test]

diff --git a/src/backtrack.rs b/src/backtrack.rs
@@ -242,9 +242,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
                     ip = inst.goto1;
                 }
                 EmptyLook(ref inst) => {
-                    let prev = self.input.previous_char(at);
-                    let next = self.input.next_char(at);
-                    if inst.matches(prev, next) {
+                    if self.input.is_empty_match(at, inst) {
                         ip = inst.goto;
                     } else {
                         return false;

diff --git a/src/compile.rs b/src/compile.rs
@@ -372,6 +372,7 @@ impl Compiler {
     }
 
     fn c_class(&mut self, ranges: &[ClassRange]) -> Result {
+        assert!(!ranges.is_empty());
         if self.compiled.uses_bytes() {
             CompileClass {
                 c: self,

diff --git a/src/dfa.rs b/src/dfa.rs
@@ -1847,7 +1847,7 @@ mod tests {
             expected == got && state.flags() == StateFlags(flags)
         }
         QuickCheck::new()
-            .gen(StdGen::new(self::rand::thread_rng(), 70_000))
+            .gen(StdGen::new(self::rand::thread_rng(), 10_000))
             .quickcheck(p as fn(Vec<u32>, u8) -> bool);
     }
 

diff --git a/src/exec.rs b/src/exec.rs
@@ -589,7 +589,11 @@ impl<'c> ExecNoSync<'c> {
                 lits.find_start(&text[start..])
                     .map(|(s, e)| (start + s, start + e))
             }
-            AnchoredEnd => self.ro.suffixes.find_end(&text),
+            AnchoredEnd => {
+                let lits = &self.ro.suffixes;
+                lits.find_end(&text[start..])
+                    .map(|(s, e)| (start + s, start + e))
+            }
         }
     }
 
@@ -917,7 +921,7 @@ impl<'c> ExecNoSync<'c> {
                 matches,
                 slots,
                 quit_after_match,
-                ByteInput::new(text),
+                ByteInput::new(text, self.ro.nfa.only_utf8),
                 start)
         } else {
             pikevm::Fsm::exec(
@@ -945,7 +949,7 @@ impl<'c> ExecNoSync<'c> {
                 &self.cache,
                 matches,
                 slots,
-                ByteInput::new(text),
+                ByteInput::new(text, self.ro.nfa.only_utf8),
                 start)
         } else {
             backtrack::Bounded::exec(

diff --git a/src/input.rs b/src/input.rs
@@ -16,8 +16,9 @@ use std::u32;
 
 use syntax;
 
-use utf8::{decode_utf8, decode_last_utf8};
 use literals::LiteralSearcher;
+use prog::InstEmptyLook;
+use utf8::{decode_utf8, decode_last_utf8};
 
 /// Represents a location in the input.
 #[derive(Clone, Copy, Debug)]
@@ -83,6 +84,10 @@ pub trait Input {
     /// If no such character could be decoded, then `Char` is absent.
     fn previous_char(&self, at: InputAt) -> Char;
 
+    /// Return true if the given empty-width instruction matches at the
+    /// input position `at`.
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
+
     /// Scan the input for a matching prefix.
     fn prefix_at(
         &self,
@@ -104,6 +109,10 @@ impl<'a, T: Input> Input for &'a T {
 
     fn previous_char(&self, at: InputAt) -> Char { (**self).previous_char(at) }
 
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        (**self).is_empty_match(at, empty)
+    }
+
     fn prefix_at(
         &self,
         prefixes: &LiteralSearcher,
@@ -155,6 +164,38 @@ impl<'t> Input for CharInput<'t> {
         decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
     }
 
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        use prog::EmptyLook::*;
+        match empty.look {
+            StartLine => {
+                let c = self.previous_char(at);
+                c.is_none() || c == '\n' // no previous char, or a newline
+            }
+            EndLine => {
+                let c = self.next_char(at);
+                c.is_none() || c == '\n' // no next char, or a newline
+            }
+            StartText => self.previous_char(at).is_none(),
+            EndText => self.next_char(at).is_none(),
+            WordBoundary => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_char() != c2.is_word_char()
+            }
+            NotWordBoundary => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_char() == c2.is_word_char()
+            }
+            WordBoundaryAscii => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_byte() != c2.is_word_byte() // ASCII word test
+            }
+            NotWordBoundaryAscii => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_byte() == c2.is_word_byte() // ASCII word test
+            }
+        }
+    }
+
     fn prefix_at(
         &self,
         prefixes: &LiteralSearcher,
@@ -178,20 +219,26 @@ impl<'t> Input for CharInput<'t> {
 /// easy access to necessary Unicode decoding (used for word boundary look
 /// ahead/look behind).
 #[derive(Clone, Copy, Debug)]
-pub struct ByteInput<'t>(&'t [u8]);
+pub struct ByteInput<'t> {
+    text: &'t [u8],
+    only_utf8: bool,
+}
 
 impl<'t> ByteInput<'t> {
     /// Return a new byte-based input reader for the given string.
-    pub fn new(s: &'t [u8]) -> ByteInput<'t> {
-        ByteInput(s)
+    pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
+        ByteInput {
+            text: text,
+            only_utf8: only_utf8,
+        }
     }
 }
 
 impl<'t> ops::Deref for ByteInput<'t> {
     type Target = [u8];
 
     fn deref(&self) -> &[u8] {
-        self.0
+        self.text
     }
 }
 
@@ -213,6 +260,58 @@ impl<'t> Input for ByteInput<'t> {
         decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
     }
 
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        use prog::EmptyLook::*;
+        match empty.look {
+            StartLine => {
+                let c = self.previous_char(at);
+                c.is_none() || c == '\n'
+            }
+            EndLine => {
+                let c = self.next_char(at);
+                c.is_none() || c == '\n'
+            }
+            StartText => self.previous_char(at).is_none(),
+            EndText => self.next_char(at).is_none(),
+            WordBoundary => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_char() != c2.is_word_char()
+            }
+            NotWordBoundary => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_char() == c2.is_word_char()
+            }
+            WordBoundaryAscii => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                if self.only_utf8 {
+                    // If we must match UTF-8, then we can't match word
+                    // boundaries at invalid UTF-8.
+                    if c1.is_none() && !at.is_start() {
+                        return false;
+                    }
+                    if c2.is_none() && !at.is_end() {
+                        return false;
+                    }
+                }
+                c1.is_word_byte() != c2.is_word_byte()
+            }
+            NotWordBoundaryAscii => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                if self.only_utf8 {
+                    // If we must match UTF-8, then we can't match a negated
+                    // word boundary at invalid UTF-8 either.
+                    if c1.is_none() && !at.is_start() {
+                        return false;
+                    }
+                    if c2.is_none() && !at.is_end() {
+                        return false;
+                    }
+                }
+                c1.is_word_byte() == c2.is_word_byte()
+            }
+        }
+    }
+
     fn prefix_at(
         &self,
         prefixes: &LiteralSearcher,
@@ -222,11 +321,11 @@ impl<'t> Input for ByteInput<'t> {
     }
 
     fn len(&self) -> usize {
-        self.0.len()
+        self.text.len()
     }
 
     fn as_bytes(&self) -> &[u8] {
-        self.0
+        &self.text
     }
 }
 
@@ -276,7 +375,7 @@ impl Char {
     pub fn is_word_byte(self) -> bool {
         match char::from_u32(self.0) {
             None => false,
-            Some(c) if c <= '\u{FF}' => syntax::is_word_byte(c as u8),
+            Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
             Some(_) => false,
         }
     }

diff --git a/src/pikevm.rs b/src/pikevm.rs
@@ -322,9 +322,7 @@ impl<'r, I: Input> Fsm<'r, I> {
             nlist.set.insert(ip);
             match self.prog[ip] {
                 EmptyLook(ref inst) => {
-                    let prev = self.input.previous_char(at);
-                    let next = self.input.next_char(at);
-                    if inst.matches(prev, next) {
+                    if self.input.is_empty_match(at, inst) {
                         ip = inst.goto;
                     }
                 }