From a1809fb587c1e9e2baba36b1ef64b68f706a3df3 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 4 Aug 2016 18:58:12 -0400 Subject: [PATCH 1/4] Reset the DFA cache based on better approximation. Typically, when a DFA blows up in size, it happens for two reasons: 1. It accumulates many states. 2. Each state accumulates more and more NFA states. Our previous approximation for the size of the DFA accounted for (1) but used a constant for the size of (2). This can turn out to result in very large differences (in the MBs) between the approximate and actual size of the DFA. Since computing the actual size is expensive, we compute it as a sum as states are added. The end result is that we more stringently respect the memory set by the caller. --- src/dfa.rs | 81 +++++++++++++++++++++++------------------------------- 1 file changed, 34 insertions(+), 47 deletions(-) diff --git a/src/dfa.rs b/src/dfa.rs index 0216f25620..ed472efca2 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -152,6 +152,9 @@ struct CacheInner { /// The total number of times this cache has been flushed by the DFA /// because of space constraints. flush_count: u64, + /// The total heap size of the DFA's cache. We use this to determine when + /// we should flush the cache. + size: usize, } /// The transition table. @@ -420,18 +423,32 @@ impl Cache { pub fn new(prog: &Program) -> Self { // We add 1 to account for the special EOF byte. let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1; - Cache { + let starts = vec![STATE_UNKNOWN; 256]; + let mut cache = Cache { inner: CacheInner { compiled: HashMap::new(), trans: Transitions::new(num_byte_classes), states: vec![], - start_states: vec![STATE_UNKNOWN; 256], + start_states: starts, stack: vec![], flush_count: 0, + size: 0, }, qcur: SparseSet::new(prog.insts.len()), qnext: SparseSet::new(prog.insts.len()), - } + }; + cache.inner.reset_size(); + cache + } +} + +impl CacheInner { + /// Resets the cache size to account for fixed costs, such as the program + /// and stack sizes. + fn reset_size(&mut self) { + self.size = + (self.start_states.len() * mem::size_of::()) + + (self.stack.len() * mem::size_of::()); } } @@ -1151,7 +1168,9 @@ impl<'a> Fsm<'a> { } // If the cache has gotten too big, wipe it. if self.approximate_size() > self.prog.dfa_size_limit { + println!("clearing cache (size: {:?})", self.approximate_size()); if !self.clear_cache_and_save(current_state) { + println!("giving up"); // Ooops. DFA is giving up. return None; } @@ -1280,6 +1299,7 @@ impl<'a> Fsm<'a> { } else { None }; + self.cache.reset_size(); self.cache.trans.clear(); self.cache.states.clear(); self.cache.compiled.clear(); @@ -1454,6 +1474,11 @@ impl<'a> Fsm<'a> { } // Finally, put our actual state on to our heap of states and index it // so we can find it later. + self.cache.size += + self.cache.trans.state_heap_size() + + (2 * state.data.len()) + + (2 * mem::size_of::()) + + mem::size_of::(); self.cache.states.push(state.clone()); self.cache.compiled.insert(state, si); // Transition table and set of states and map should all be in sync. @@ -1536,51 +1561,8 @@ impl<'a> Fsm<'a> { /// be wiped. Namely, it is possible that for certain regexes on certain /// inputs, a new state could be created for every byte of input. (This is /// bad for memory use, so we bound it with a cache.) - /// - /// The approximation is guaranteed to be done in constant time (and - /// indeed, this requirement is why it's approximate). fn approximate_size(&self) -> usize { - use std::mem::size_of as size; - // Estimate that there are about 16 instructions per state consuming - // 20 = 4 + (15 * 1) bytes of space (1 byte because of delta encoding). - const STATE_HEAP: usize = 20 + 1; // one extra byte for flags - let compiled = - (self.cache.compiled.len() * (size::() + STATE_HEAP)) - + (self.cache.compiled.len() * size::()); - let states = - self.cache.states.len() - * (size::() - + STATE_HEAP - + (self.num_byte_classes() * size::())); - let start_states = self.cache.start_states.len() * size::(); - self.prog.approximate_size() + compiled + states + start_states - } - - /// Returns the actual heap space of the DFA. This visits every state in - /// the DFA. - #[allow(dead_code)] // useful for debugging - fn actual_size(&self) -> usize { - let mut compiled = 0; - for k in self.cache.compiled.keys() { - compiled += mem::size_of::(); - compiled += mem::size_of::(); - compiled += k.data.len() * mem::size_of::(); - } - let mut states = 0; - for s in &self.cache.states { - states += mem::size_of::(); - states += s.data.len() * mem::size_of::(); - } - compiled - + states - + (self.cache.trans.num_states() * - mem::size_of::() * - self.num_byte_classes()) - + (self.cache.start_states.len() * mem::size_of::()) - + (self.cache.stack.len() * mem::size_of::()) - + mem::size_of::() - + mem::size_of::() - + self.prog.approximate_size() // OK, not actual, but close enough + self.cache.size + self.prog.approximate_size() } } @@ -1628,6 +1610,11 @@ impl Transitions { self.table[si as usize + cls] } + /// The heap size, in bytes, of a single state in the transition table. + fn state_heap_size(&self) -> usize { + self.num_byte_classes * mem::size_of::() + } + /// Like `next`, but uses unchecked access and is therefore unsafe. unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr { debug_assert!((si as usize) < self.table.len()); From 225f8e190d21227bb77be3585c06df76d3406930 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 4 Aug 2016 19:36:09 -0400 Subject: [PATCH 2/4] Disable literal optimizations for partially anchored regexes. The specific problem here is that our literal search doesn't know about anchors, so it will try to search all of the detected literals in a regex. In a regex like `a|^b`, the literal `b` should only be searched for at the beginning of the haystack and in no other place. The right way to fix this is probably to make the literal detector smarter, but the literal detector is already too complex. Instead, this commit detects whether a regex is partially anchored (that is, when the regex has at least one matchable sub-expression that is anchored), and if so, disables the literal engine. Note that this doesn't disable all literal optimizations, just the optimization that opts out of regex engines entirely. Both the DFA and the NFA will still use literal prefixes to search. Namely, if it searches and finds a literal that needs to be anchored but isn't in the haystack, then the regex engine rules it out as a false positive. Fixes #268. --- regex-syntax/src/lib.rs | 30 ++++++++++++++++++++++++++++++ src/compile.rs | 6 ++++++ src/exec.rs | 10 +++++++++- src/prog.rs | 8 ++++++++ tests/regression.rs | 3 +++ 5 files changed, 56 insertions(+), 1 deletion(-) diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index ee96045f4a..147a490d57 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -528,6 +528,21 @@ impl Expr { } } + /// Returns true if and only if the expression has at least one matchable + /// sub-expression that must match the beginning of text. + pub fn has_anchored_start(&self) -> bool { + match *self { + Repeat { ref e, r, .. } => { + !r.matches_empty() && e.has_anchored_start() + } + Group { ref e, .. } => e.has_anchored_start(), + Concat(ref es) => es[0].has_anchored_start(), + Alternate(ref es) => es.iter().any(|e| e.has_anchored_start()), + StartText => true, + _ => false, + } + } + /// Returns true if and only if the expression is required to match at the /// end of the text. pub fn is_anchored_end(&self) -> bool { @@ -543,6 +558,21 @@ impl Expr { } } + /// Returns true if and only if the expression has at least one matchable + /// sub-expression that must match the beginning of text. + pub fn has_anchored_end(&self) -> bool { + match *self { + Repeat { ref e, r, .. } => { + !r.matches_empty() && e.has_anchored_end() + } + Group { ref e, .. } => e.has_anchored_end(), + Concat(ref es) => es[es.len() - 1].has_anchored_end(), + Alternate(ref es) => es.iter().any(|e| e.has_anchored_end()), + EndText => true, + _ => false, + } + } + /// Returns true if and only if the expression contains sub-expressions /// that can match arbitrary bytes. pub fn has_bytes(&self) -> bool { diff --git a/src/compile.rs b/src/compile.rs index 9db743f489..32d1f6ac9b 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -143,7 +143,9 @@ impl Compiler { // matching engine itself. let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; self.compiled.is_anchored_start = expr.is_anchored_start(); + self.compiled.has_anchored_start = expr.has_anchored_start(); self.compiled.is_anchored_end = expr.is_anchored_end(); + self.compiled.has_anchored_end = expr.has_anchored_end(); if self.compiled.needs_dotstar() { dotstar_patch = try!(self.c_dotstar()); self.compiled.start = dotstar_patch.entry; @@ -171,6 +173,10 @@ impl Compiler { exprs.iter().all(|e| e.is_anchored_start()); self.compiled.is_anchored_end = exprs.iter().all(|e| e.is_anchored_end()); + self.compiled.has_anchored_start = + exprs.iter().any(|e| e.has_anchored_start()); + self.compiled.has_anchored_end = + exprs.iter().any(|e| e.has_anchored_end()); let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; if self.compiled.needs_dotstar() { dotstar_patch = try!(self.c_dotstar()); diff --git a/src/exec.rs b/src/exec.rs index 65a3935a72..2e2e421c8b 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -1113,7 +1113,15 @@ impl ExecReadOnly { // If our set of prefixes is complete, then we can use it to find // a match in lieu of a regex engine. This doesn't quite work well in // the presence of multiple regexes, so only do it when there's one. - if self.res.len() == 1 { + // + // TODO(burntsushi): Also, don't try to match literals if the regex is + // partially anchored. We could technically do it, but we'd need to + // create two sets of literals: all of them and then the subset that + // aren't anchored. We would then only search for all of them when at + // the beginning of the input and use the subset in all other cases. + if self.res.len() == 1 + && !self.nfa.has_anchored_start + && !self.nfa.has_anchored_end { if self.nfa.prefixes.complete() { return if self.nfa.is_anchored_start { Literal(MatchLiteralType::AnchoredStart) diff --git a/src/prog.rs b/src/prog.rs index 36f2aff879..41ebde009b 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -52,6 +52,12 @@ pub struct Program { pub is_anchored_start: bool, /// Whether the regex must match at the end of the input. pub is_anchored_end: bool, + /// Whether the regex has at least one matchable sub-expression that must + /// match from the start of the input. + pub has_anchored_start: bool, + /// Whether the regex has at least one matchable sub-expression that must + /// match at the end of the input. + pub has_anchored_end: bool, /// Whether this program contains a Unicode word boundary instruction. pub has_unicode_word_boundary: bool, /// A possibly empty machine for very quickly matching prefix literals. @@ -91,6 +97,8 @@ impl Program { is_reverse: false, is_anchored_start: false, is_anchored_end: false, + has_anchored_start: false, + has_anchored_end: false, has_unicode_word_boundary: false, prefixes: LiteralSearcher::empty(), dfa_size_limit: 2 * (1<<20), diff --git a/tests/regression.rs b/tests/regression.rs index e694dd01b9..0d0cdd50ac 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -57,3 +57,6 @@ split!(split_on_word_boundary, r"\b", r"Should this (work?)", t!(" ("), t!("work"), t!("?)")]); matiter!(word_boundary_dfa, r"\b", "a b c", (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + +// See: https://github.com/rust-lang-nursery/regex/issues/268 +matiter!(partial_anchor, u!(r"^a|b"), "ba", (0, 1)); From 1882b2ca6f5f452e377f7fbf30cbf1b55c51930a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 4 Aug 2016 20:17:43 -0400 Subject: [PATCH 3/4] Adjust the end of the haystack after a DFA match. If the caller asks for captures, and the DFA runs, and there's a match, and there are actually captures in the regex, then the haystack sent to the NFA is shortened to correspond to only the match plus some room at the end for matching zero-width assertions. This "room at the end" needs to be big enough to at least fit an UTF-8 encoded Unicode codepoint. Fixes #264. --- src/exec.rs | 6 +----- tests/regression.rs | 4 ++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/exec.rs b/src/exec.rs index 2e2e421c8b..5d0541a13c 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -840,11 +840,7 @@ impl<'c> ExecNoSync<'c> { ) -> Option<(usize, usize)> { // We can't use match_end directly, because we may need to examine // one "character" after the end of a match for lookahead operators. - let e = if self.ro.nfa.uses_bytes() { - cmp::min(match_end + 1, text.len()) - } else { - cmp::min(next_utf8(text, match_end), text.len()) - }; + let e = cmp::min(next_utf8(text, match_end), text.len()); self.captures_nfa(slots, &text[..e], match_start) } diff --git a/tests/regression.rs b/tests/regression.rs index 0d0cdd50ac..3b7a1fe917 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -60,3 +60,7 @@ matiter!(word_boundary_dfa, r"\b", "a b c", // See: https://github.com/rust-lang-nursery/regex/issues/268 matiter!(partial_anchor, u!(r"^a|b"), "ba", (0, 1)); + +// See: https://github.com/rust-lang-nursery/regex/issues/264 +mat!(ascii_boundary_no_capture, u!(r"(?-u)\B"), "\u{28f3e}", Some((0, 0))); +mat!(ascii_boundary_capture, u!(r"(?-u)(\B)"), "\u{28f3e}", Some((0, 0))); From 16931b006d49d5371f8f2c96aacf8b530643f514 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 4 Aug 2016 20:41:09 -0400 Subject: [PATCH 4/4] Don't build regex-debug on Rust 1.3. Docopt uses lazy_static! 2.x, but lazy_static required a new minimum Rust version in 2.1. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index b8a3de88b5..9eed600032 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,8 +7,8 @@ rust: sudo: false script: - cargo build --verbose - - cargo build --verbose --manifest-path=regex-debug/Cargo.toml - if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then + cargo build --verbose --manifest-path=regex-debug/Cargo.toml; RUSTFLAGS="-C target-feature=+ssse3" cargo test --verbose --features 'simd-accel pattern'; else travis_wait cargo test --verbose;