Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix several small bugs found from fuzzing #262

Merged
merged 6 commits into from
Jul 10, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions regex-syntax/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1336,6 +1336,9 @@ pub enum ErrorKind {
/// This is never returned if the parser is permitted to allow expressions
/// that match arbitrary bytes.
InvalidUtf8,
/// A character class was constructed such that it is empty.
/// e.g., `[^\d\D]`.
EmptyClass,
/// Hints that destructuring should not be exhaustive.
///
/// This enum may grow additional variants, so this makes sure clients
Expand Down Expand Up @@ -1398,6 +1401,7 @@ impl ErrorKind {
FlagNotAllowed(_) => "flag not allowed",
UnicodeNotAllowed => "Unicode features not allowed",
InvalidUtf8 => "matching arbitrary bytes is not allowed",
EmptyClass => "empty character class",
__Nonexhaustive => unreachable!(),
}
}
Expand Down Expand Up @@ -1507,6 +1511,8 @@ impl fmt::Display for ErrorKind {
(u) flag is not set."),
InvalidUtf8 =>
write!(f, "Matching arbitrary bytes is not allowed."),
EmptyClass =>
write!(f, "Empty character classes are not allowed."),
__Nonexhaustive => unreachable!(),
}
}
Expand Down
46 changes: 34 additions & 12 deletions regex-syntax/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -581,12 +581,18 @@ impl Parser {
_ => unreachable!(),
},
start => {
if !self.flags.unicode {
let _ = try!(self.codepoint_to_one_byte(start));
}
self.bump();
try!(self.parse_class_range(&mut class, start));
}
}
}
class = self.class_transform(negated, class).canonicalize();
if class.is_empty() {
return Err(self.err(ErrorKind::EmptyClass));
}
Ok(Build::Expr(if self.flags.unicode {
Expr::Class(class)
} else {
Expand Down Expand Up @@ -639,7 +645,13 @@ impl Parser {
// Because `parse_escape` can never return `LeftParen`.
_ => unreachable!(),
},
_ => self.bump(),
_ => {
let c = self.bump();
if !self.flags.unicode {
let _ = try!(self.codepoint_to_one_byte(c));
}
c
}
};
if end < start {
// e.g., [z-a]
Expand Down Expand Up @@ -1277,7 +1289,7 @@ mod tests {
ErrorKind,
};
use unicode::regex::{PERLD, PERLS, PERLW};
use super::{LOWER, UPPER, Flags, Parser, ascii_class};
use super::{LOWER, UPPER, WORD, Flags, Parser, ascii_class};

static YI: &'static [(char, char)] = &[
('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}'),
Expand Down Expand Up @@ -2002,6 +2014,8 @@ mod tests {

assert_eq!(pb(r"(?-u)[a]"), Expr::ClassBytes(bclass(&[(b'a', b'a')])));
assert_eq!(pb(r"(?-u)[\x00]"), Expr::ClassBytes(bclass(&[(0, 0)])));
assert_eq!(pb(r"(?-u)[\xFF]"),
Expr::ClassBytes(bclass(&[(0xFF, 0xFF)])));
assert_eq!(pb("(?-u)[\n]"),
Expr::ClassBytes(bclass(&[(b'\n', b'\n')])));
assert_eq!(pb(r"(?-u)[\n]"),
Expand Down Expand Up @@ -2127,10 +2141,10 @@ mod tests {

#[test]
fn class_multiple_class_negate_negate() {
let nperld = class(PERLD).negate();
let nperlw = class(PERLW).negate();
let nyi = class(YI).negate();
let cls = CharClass::empty().merge(nperld).merge(nyi);
assert_eq!(p(r"[^\D\P{Yi}]"), Expr::Class(cls.negate()));
let cls = CharClass::empty().merge(nperlw).merge(nyi);
assert_eq!(p(r"[^\W\P{Yi}]"), Expr::Class(cls.negate()));
}

#[test]
Expand All @@ -2149,10 +2163,10 @@ mod tests {

#[test]
fn class_multiple_class_negate_negate_casei() {
let nperld = class(PERLD).negate();
let nperlw = class(PERLW).negate();
let nyi = class(YI).negate();
let class = CharClass::empty().merge(nperld).merge(nyi);
assert_eq!(p(r"(?i)[^\D\P{Yi}]"),
let class = CharClass::empty().merge(nperlw).merge(nyi);
assert_eq!(p(r"(?i)[^\W\P{Yi}]"),
Expr::Class(class.case_fold().negate()));
}

Expand Down Expand Up @@ -2236,10 +2250,10 @@ mod tests {

#[test]
fn ascii_classes_negate_multiple() {
let (nlower, nupper) = (class(LOWER).negate(), class(UPPER).negate());
let cls = CharClass::empty().merge(nlower).merge(nupper);
assert_eq!(p("[[:^lower:][:^upper:]]"), Expr::Class(cls.clone()));
assert_eq!(p("[^[:^lower:][:^upper:]]"), Expr::Class(cls.negate()));
let (nlower, nword) = (class(LOWER).negate(), class(WORD).negate());
let cls = CharClass::empty().merge(nlower).merge(nword);
assert_eq!(p("[[:^lower:][:^word:]]"), Expr::Class(cls.clone()));
assert_eq!(p("[^[:^lower:][:^word:]]"), Expr::Class(cls.negate()));
}

#[test]
Expand Down Expand Up @@ -2402,6 +2416,13 @@ mod tests {
test_err!(r"☃(?-u:\pL)", 9, ErrorKind::UnicodeNotAllowed, flags);
}

// Checks that a non-ASCII literal inside a character class is rejected with
// `UnicodeNotAllowed` once `(?-u)` is in effect, both as a single class
// member (`[☃]`) and as the endpoints of a range (`[☃-☃]`).
#[test]
fn unicode_class_literal_not_allowed() {
// Same defaults as `Flags::default()`, except `allow_bytes` is switched on.
let flags = Flags { allow_bytes: true, .. Flags::default() };
// The expected position `6` is the length of the `(?-u)[` prefix, i.e. the
// offset at which the snowman literal begins in both patterns.
test_err!(r"(?-u)[☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
test_err!(r"(?-u)[☃-☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
}

#[test]
fn unicode_hex_not_allowed() {
let flags = Flags { allow_bytes: true, .. Flags::default() };
Expand Down Expand Up @@ -2725,6 +2746,7 @@ mod tests {
// Error cases for character classes that end up with nothing in them.
fn error_class_empty_range() {
// Patterns that consist only of an opening bracket sequence are expected to
// fail with `UnexpectedClassEof`, reported at the end of the pattern.
test_err!("[]", 2, ErrorKind::UnexpectedClassEof);
test_err!("[^]", 3, ErrorKind::UnexpectedClassEof);
// A syntactically complete class whose members cancel out (`\d` and `\D`
// together, then negated) is expected to fail with `EmptyClass`.
test_err!(r"[^\d\D]", 7, ErrorKind::EmptyClass);
}

#[test]
Expand Down
4 changes: 1 addition & 3 deletions src/backtrack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -242,9 +242,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
ip = inst.goto1;
}
EmptyLook(ref inst) => {
let prev = self.input.previous_char(at);
let next = self.input.next_char(at);
if inst.matches(prev, next) {
if self.input.is_empty_match(at, inst) {
ip = inst.goto;
} else {
return false;
Expand Down
1 change: 1 addition & 0 deletions src/compile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ impl Compiler {
}

fn c_class(&mut self, ranges: &[ClassRange]) -> Result {
assert!(!ranges.is_empty());
if self.compiled.uses_bytes() {
CompileClass {
c: self,
Expand Down
2 changes: 1 addition & 1 deletion src/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1847,7 +1847,7 @@ mod tests {
expected == got && state.flags() == StateFlags(flags)
}
QuickCheck::new()
.gen(StdGen::new(self::rand::thread_rng(), 70_000))
.gen(StdGen::new(self::rand::thread_rng(), 10_000))
.quickcheck(p as fn(Vec<u32>, u8) -> bool);
}

Expand Down
10 changes: 7 additions & 3 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,11 @@ impl<'c> ExecNoSync<'c> {
lits.find_start(&text[start..])
.map(|(s, e)| (start + s, start + e))
}
AnchoredEnd => self.ro.suffixes.find_end(&text),
AnchoredEnd => {
let lits = &self.ro.suffixes;
lits.find_end(&text[start..])
.map(|(s, e)| (start + s, start + e))
}
}
}

Expand Down Expand Up @@ -917,7 +921,7 @@ impl<'c> ExecNoSync<'c> {
matches,
slots,
quit_after_match,
ByteInput::new(text),
ByteInput::new(text, self.ro.nfa.only_utf8),
start)
} else {
pikevm::Fsm::exec(
Expand Down Expand Up @@ -945,7 +949,7 @@ impl<'c> ExecNoSync<'c> {
&self.cache,
matches,
slots,
ByteInput::new(text),
ByteInput::new(text, self.ro.nfa.only_utf8),
start)
} else {
backtrack::Bounded::exec(
Expand Down
115 changes: 107 additions & 8 deletions src/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ use std::u32;

use syntax;

use utf8::{decode_utf8, decode_last_utf8};
use literals::LiteralSearcher;
use prog::InstEmptyLook;
use utf8::{decode_utf8, decode_last_utf8};

/// Represents a location in the input.
#[derive(Clone, Copy, Debug)]
Expand Down Expand Up @@ -83,6 +84,10 @@ pub trait Input {
/// If no such character could be decoded, then `Char` is absent.
fn previous_char(&self, at: InputAt) -> Char;

/// Return true if the given empty width instruction matches at the
/// input position given.
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;

/// Scan the input for a matching prefix.
fn prefix_at(
&self,
Expand All @@ -104,6 +109,10 @@ impl<'a, T: Input> Input for &'a T {

fn previous_char(&self, at: InputAt) -> Char { (**self).previous_char(at) }

fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
(**self).is_empty_match(at, empty)
}

fn prefix_at(
&self,
prefixes: &LiteralSearcher,
Expand Down Expand Up @@ -155,6 +164,38 @@ impl<'t> Input for CharInput<'t> {
decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
}

/// Reports whether the empty-width instruction `empty` matches at the
/// input position `at`.
///
/// The text and line anchors look only at the single neighbouring
/// character they need; the word-boundary variants compare the characters
/// on both sides of `at`. The Unicode variants classify with
/// `is_word_char`, the ASCII variants with `is_word_byte`.
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
    use prog::EmptyLook::*;
    match empty.look {
        // Text anchors: nothing decodable on the relevant side.
        StartText => self.previous_char(at).is_none(),
        EndText => self.next_char(at).is_none(),
        // Line anchors: either nothing on that side, or a line feed.
        StartLine => {
            let before = self.previous_char(at);
            before.is_none() || before == '\n'
        }
        EndLine => {
            let after = self.next_char(at);
            after.is_none() || after == '\n'
        }
        // Unicode-aware word boundaries.
        WordBoundary => {
            let before = self.previous_char(at);
            let after = self.next_char(at);
            before.is_word_char() != after.is_word_char()
        }
        NotWordBoundary => {
            let before = self.previous_char(at);
            let after = self.next_char(at);
            before.is_word_char() == after.is_word_char()
        }
        // ASCII-only word boundaries.
        WordBoundaryAscii => {
            let before = self.previous_char(at);
            let after = self.next_char(at);
            before.is_word_byte() != after.is_word_byte()
        }
        NotWordBoundaryAscii => {
            let before = self.previous_char(at);
            let after = self.next_char(at);
            before.is_word_byte() == after.is_word_byte()
        }
    }
}

fn prefix_at(
&self,
prefixes: &LiteralSearcher,
Expand All @@ -178,20 +219,26 @@ impl<'t> Input for CharInput<'t> {
/// easy access to necessary Unicode decoding (used for word boundary look
/// ahead/look behind).
#[derive(Clone, Copy, Debug)]
pub struct ByteInput<'t>(&'t [u8]);
pub struct ByteInput<'t> {
/// The bytes this input reads from.
text: &'t [u8],
/// When true, the ASCII word-boundary checks in `is_empty_match` refuse to
/// match at a position whose neighbouring character could not be decoded,
/// unless that position is the very start or end of the input.
only_utf8: bool,
}

impl<'t> ByteInput<'t> {
/// Return a new byte-based input reader for the given string.
pub fn new(s: &'t [u8]) -> ByteInput<'t> {
ByteInput(s)
pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
ByteInput {
text: text,
only_utf8: only_utf8,
}
}
}

impl<'t> ops::Deref for ByteInput<'t> {
type Target = [u8];

fn deref(&self) -> &[u8] {
self.0
self.text
}
}

Expand All @@ -213,6 +260,58 @@ impl<'t> Input for ByteInput<'t> {
decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
}

fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
use prog::EmptyLook::*;
match empty.look {
StartLine => {
let c = self.previous_char(at);
c.is_none() || c == '\n'
}
EndLine => {
let c = self.next_char(at);
c.is_none() || c == '\n'
}
StartText => self.previous_char(at).is_none(),
EndText => self.next_char(at).is_none(),
WordBoundary => {
let (c1, c2) = (self.previous_char(at), self.next_char(at));
c1.is_word_char() != c2.is_word_char()
}
NotWordBoundary => {
let (c1, c2) = (self.previous_char(at), self.next_char(at));
c1.is_word_char() == c2.is_word_char()
}
WordBoundaryAscii => {
let (c1, c2) = (self.previous_char(at), self.next_char(at));
if self.only_utf8 {
// If we must match UTF-8, then we can't match word
// boundaries at invalid UTF-8.
if c1.is_none() && !at.is_start() {
return false;
}
if c2.is_none() && !at.is_end() {
return false;
}
}
c1.is_word_byte() != c2.is_word_byte()
}
NotWordBoundaryAscii => {
let (c1, c2) = (self.previous_char(at), self.next_char(at));
if self.only_utf8 {
// If we must match UTF-8, then we can't match word
// boundaries at invalid UTF-8.
if c1.is_none() && !at.is_start() {
return false;
}
if c2.is_none() && !at.is_end() {
return false;
}
}
c1.is_word_byte() == c2.is_word_byte()
}
}
}

fn prefix_at(
&self,
prefixes: &LiteralSearcher,
Expand All @@ -222,11 +321,11 @@ impl<'t> Input for ByteInput<'t> {
}

fn len(&self) -> usize {
self.0.len()
self.text.len()
}

fn as_bytes(&self) -> &[u8] {
self.0
&self.text
}
}

Expand Down Expand Up @@ -276,7 +375,7 @@ impl Char {
pub fn is_word_byte(self) -> bool {
match char::from_u32(self.0) {
None => false,
Some(c) if c <= '\u{FF}' => syntax::is_word_byte(c as u8),
Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
Some(_) => false,
}
}
Expand Down
4 changes: 1 addition & 3 deletions src/pikevm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -322,9 +322,7 @@ impl<'r, I: Input> Fsm<'r, I> {
nlist.set.insert(ip);
match self.prog[ip] {
EmptyLook(ref inst) => {
let prev = self.input.previous_char(at);
let next = self.input.next_char(at);
if inst.matches(prev, next) {
if self.input.is_empty_match(at, inst) {
ip = inst.goto;
}
}
Expand Down
Loading