Skip to content

Commit

Permalink
Merge pull request #186 from trishume/regex-cache-lazycell
Browse files Browse the repository at this point in the history
Cache compiled regexes with lazycell
  • Loading branch information
robinst authored Jul 5, 2018
2 parents 312a648 + 2c81736 commit 2a81b81
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 57 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ onig = { version = "3.2.1", optional = true }
walkdir = "2.0"
regex-syntax = { version = "0.4", optional = true }
lazy_static = "1.0"
lazycell = "1.0"
bitflags = "1.0"
plist = "0.3"
bincode = { version = "1.0", optional = true }
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ extern crate walkdir;
extern crate regex_syntax;
#[macro_use]
extern crate lazy_static;
extern crate lazycell;
extern crate plist;
#[cfg(any(feature = "dump-load-rs", feature = "dump-load", feature = "dump-create"))]
extern crate bincode;
Expand All @@ -42,6 +43,7 @@ extern crate serde_json;
#[cfg(test)]
#[macro_use]
extern crate pretty_assertions;

pub mod highlighting;
pub mod parsing;
pub mod util;
Expand Down
46 changes: 26 additions & 20 deletions src/parsing/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -326,8 +326,8 @@ impl ParseState {

for (from_with_proto, ctx, captures) in context_chain {
for (pat_context_ptr, pat_index) in context_iter(ctx) {
let mut pat_context = pat_context_ptr.borrow_mut();
let match_pat = pat_context.match_at_mut(pat_index);
let pat_context = pat_context_ptr.borrow();
let match_pat = pat_context.match_at(pat_index);

if let Some(match_region) = self.search(
line, start, match_pat, captures, search_cache, regions
Expand Down Expand Up @@ -374,7 +374,7 @@ impl ParseState {
fn search(&self,
line: &str,
start: usize,
match_pat: &mut MatchPattern,
match_pat: &MatchPattern,
captures: Option<&(Region, String)>,
search_cache: &mut SearchCache,
regions: &mut Region)
Expand All @@ -396,24 +396,30 @@ impl ParseState {
}
}

match_pat.ensure_compiled_if_possible();
let refs_regex = if match_pat.has_captures && captures.is_some() {
let (matched, can_cache) = if match_pat.has_captures && captures.is_some() {
let &(ref region, ref s) = captures.unwrap();
Some(match_pat.compile_with_refs(region, s))
let regex = match_pat.regex_with_refs(region, s);
let matched = regex.search_with_param(
line,
start,
line.len(),
SearchOptions::SEARCH_OPTION_NONE,
Some(regions),
MatchParam::default(),
);
(matched, false)
} else {
None
let regex = match_pat.regex();
let matched = regex.search_with_param(
line,
start,
line.len(),
SearchOptions::SEARCH_OPTION_NONE,
Some(regions),
MatchParam::default()
);
(matched, true)
};
let regex = if let Some(ref rgx) = refs_regex {
rgx
} else {
match_pat.regex.as_ref().unwrap()
};
let matched = regex.search_with_param(line,
start,
line.len(),
SearchOptions::SEARCH_OPTION_NONE,
Some(regions),
MatchParam::default());

// If there's an error during search, treat it as non-matching.
// For example, in case of catastrophic backtracking, onig should
Expand All @@ -425,14 +431,14 @@ impl ParseState {
MatchOperation::None => match_start != match_end,
_ => true,
};
if refs_regex.is_none() && does_something {
if can_cache && does_something {
search_cache.insert(match_pat, Some(regions.clone()));
}
if does_something {
// print!("catch {} at {} on {}", match_pat.regex_str, match_start, line);
return Some(regions.clone());
}
} else if refs_regex.is_none() {
} else if can_cache {
search_cache.insert(match_pat, None);
}
return None;
Expand Down
100 changes: 72 additions & 28 deletions src/parsing/syntax_definition.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
//! into this data structure?
use std::collections::{BTreeMap, HashMap};
use std::hash::Hash;
use lazycell::AtomicLazyCell;
use onig::{Regex, RegexOptions, Region, Syntax};
use std::rc::{Rc, Weak};
use std::cell::RefCell;
Expand Down Expand Up @@ -86,16 +87,17 @@ pub struct MatchIter {
index_stack: Vec<usize>,
}

#[derive(Debug, Eq, PartialEq, Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct MatchPattern {
pub has_captures: bool,
pub regex_str: String,
#[serde(skip_serializing, skip_deserializing)]
pub regex: Option<Regex>,
pub scope: Vec<Scope>,
pub captures: Option<CaptureMapping>,
pub operation: MatchOperation,
pub with_prototype: Option<ContextPtr>,

#[serde(skip_serializing, skip_deserializing, default = "AtomicLazyCell::new")]
regex: AtomicLazyCell<Regex>,
}

/// This wrapper only exists so that I can implement a serialization
Expand Down Expand Up @@ -186,14 +188,6 @@ impl Context {
_ => panic!("bad index to match_at"),
}
}

/// Returns a mutable reference, otherwise like `match_at`
pub fn match_at_mut(&mut self, index: usize) -> &mut MatchPattern {
match self.patterns[index] {
Pattern::Match(ref mut match_pat) => match_pat,
_ => panic!("bad index to match_at"),
}
}
}

impl ContextReference {
Expand Down Expand Up @@ -233,6 +227,25 @@ pub(crate) fn substitute_backrefs_in_regex<F>(regex_str: &str, substituter: F) -

impl MatchPattern {

pub fn new(
has_captures: bool,
regex_str: String,
scope: Vec<Scope>,
captures: Option<CaptureMapping>,
operation: MatchOperation,
with_prototype: Option<ContextPtr>,
) -> MatchPattern {
MatchPattern {
has_captures,
regex_str,
scope,
captures,
operation,
with_prototype,
regex: AtomicLazyCell::new(),
}
}

/// substitutes back-refs in Regex with regions from s
/// used for match patterns which refer to captures from the pattern
/// that pushed them.
Expand All @@ -244,33 +257,47 @@ impl MatchPattern {

/// Used by the parser to compile a regex which needs to reference
/// regions from another matched pattern.
pub fn compile_with_refs(&self, region: &Region, s: &str) -> Regex {
pub fn regex_with_refs(&self, region: &Region, s: &str) -> Regex {
// TODO don't panic on invalid regex
Regex::with_options(&self.regex_with_substitutes(region, s),
RegexOptions::REGEX_OPTION_CAPTURE_GROUP,
Syntax::default())
.unwrap()
}

fn compile_regex(&mut self) {
// TODO don't panic on invalid regex
let compiled = Regex::with_options(&self.regex_str,
RegexOptions::REGEX_OPTION_CAPTURE_GROUP,
Syntax::default())
.unwrap();
self.regex = Some(compiled);
pub fn regex(&self) -> &Regex {
if let Some(regex) = self.regex.borrow() {
regex
} else {
// TODO don't panic on invalid regex
let regex = Regex::with_options(
&self.regex_str,
RegexOptions::REGEX_OPTION_CAPTURE_GROUP,
Syntax::default(),
).unwrap();
// Fill returns an error if it has already been filled. This might
// happen if two threads race here. In that case, just use the value
// that won and is now in the cell.
self.regex.fill(regex).ok();
self.regex.borrow().unwrap()
}
}
}

/// Makes sure the regex is compiled if it doesn't have captures.
/// May compile the regex if it isn't, panicing if compilation fails.
#[inline]
pub fn ensure_compiled_if_possible(&mut self) {
if self.regex.is_none() && !self.has_captures {
self.compile_regex();
}
impl Eq for MatchPattern {}

impl PartialEq for MatchPattern {
fn eq(&self, other: &MatchPattern) -> bool {
self.has_captures == other.has_captures &&
self.regex_str == other.regex_str &&
self.scope == other.scope &&
self.captures == other.captures &&
self.operation == other.operation &&
self.with_prototype == other.with_prototype
}
}


impl Eq for LinkerLink {}

impl PartialEq for LinkerLink {
Expand Down Expand Up @@ -307,17 +334,18 @@ fn ordered_map<K, V, S>(map: &HashMap<K, V>, serializer: S) -> Result<S::Ok, S::
#[cfg(test)]
mod tests {
use super::*;

#[test]
fn can_compile_refs() {
use onig::{SearchOptions, Regex, Region};
let pat = MatchPattern {
has_captures: true,
regex_str: String::from(r"lol \\ \2 \1 '\9' \wz"),
regex: None,
scope: vec![],
captures: None,
operation: MatchOperation::None,
with_prototype: None,
regex: AtomicLazyCell::new(),
};
let r = Regex::new(r"(\\\[\]\(\))(b)(c)(d)(e)").unwrap();
let mut region = Region::new();
Expand All @@ -326,6 +354,22 @@ mod tests {

let regex_res = pat.regex_with_substitutes(&region, s);
assert_eq!(regex_res, r"lol \\ b \\\[\]\(\) '' \wz");
pat.compile_with_refs(&region, s);
pat.regex_with_refs(&region, s);
}

#[test]
fn caches_compiled_regex() {
let pat = MatchPattern {
has_captures: false,
regex_str: String::from(r"\w+"),
scope: vec![],
captures: None,
operation: MatchOperation::None,
with_prototype: None,
regex: AtomicLazyCell::new(),
};

assert!(pat.regex().is_match("test"));
assert!(pat.regex.filled());
}
}
17 changes: 8 additions & 9 deletions src/parsing/yaml_load.rs
Original file line number Diff line number Diff line change
Expand Up @@ -399,15 +399,14 @@ impl SyntaxDefinition {
None
};

let pattern = MatchPattern {
has_captures: has_captures,
regex_str: regex_str,
regex: None,
scope: scope,
captures: captures,
operation: operation,
with_prototype: with_prototype,
};
let pattern = MatchPattern::new(
has_captures,
regex_str,
scope,
captures,
operation,
with_prototype,
);

Ok(pattern)
}
Expand Down

0 comments on commit 2a81b81

Please sign in to comment.