diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b51df279..36d73f5d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ Deprecations: Performance improvements: +* [PERF #1087](https://github.com/BurntSushi/ripgrep/pull/1087): + ripgrep is smarter when detected literals are whitespace. * [PERF #1381](https://github.com/BurntSushi/ripgrep/pull/1381): Directory traversal is sped up with speculative ignore-file existence checks. * [PERF cd8ec38a](https://github.com/BurntSushi/ripgrep/commit/cd8ec38a): diff --git a/crates/regex/src/lib.rs b/crates/regex/src/lib.rs index 91fe93b1f..4004a6925 100644 --- a/crates/regex/src/lib.rs +++ b/crates/regex/src/lib.rs @@ -5,6 +5,7 @@ An implementation of `grep-matcher`'s `Matcher` trait for Rust's regex engine. #![deny(missing_docs)] extern crate aho_corasick; +extern crate bstr; extern crate grep_matcher; #[macro_use] extern crate log; diff --git a/crates/regex/src/literal.rs b/crates/regex/src/literal.rs index cbb49dbea..49bc4ca20 100644 --- a/crates/regex/src/literal.rs +++ b/crates/regex/src/literal.rs @@ -5,6 +5,7 @@ the regex engine doesn't look for inner literals. Since we're doing line based searching, we can use them, so we need to do it ourselves. */ +use bstr::ByteSlice; use regex_syntax::hir::literal::{Literal, Literals}; use regex_syntax::hir::{self, Hir, HirKind}; @@ -99,7 +100,12 @@ impl LiteralSets { // helps with case insensitive matching, which can generate lots of // inner required literals. let any_empty = req_lits.iter().any(|lit| lit.is_empty()); - if req.len() > lit.len() && req_lits.len() > 1 && !any_empty { + let any_white = has_only_whitespace(&req_lits); + if req.len() > lit.len() + && req_lits.len() > 1 + && !any_empty + && !any_white + { debug!("required literals found: {:?}", req_lits); let alts: Vec = req_lits .into_iter() @@ -121,7 +127,7 @@ impl LiteralSets { // with the highest minimum length and use that to build our "fast" // regex. // - // This manifest in fairly common scenarios. e.g., + // This manifests in fairly common scenarios. e.g., // // rg -w 'foo|bar|baz|quux' // @@ -159,12 +165,20 @@ impl LiteralSets { }; debug!("prefix/suffix literals found: {:?}", lits); + if has_only_whitespace(lits) { + debug!("dropping literals because one was whitespace"); + return None; + } let alts: Vec = lits.into_iter().map(|x| util::bytes_to_regex(x)).collect(); // We're matching raw bytes, so disable Unicode mode. Some(format!("(?-u:{})", alts.join("|"))) } else { debug!("required literal found: {:?}", util::show_bytes(lit)); + if lit.chars().all(|c| c.is_whitespace()) { + debug!("dropping literal because one was whitespace"); + return None; + } Some(format!("(?-u:{})", util::bytes_to_regex(&lit))) } } @@ -328,6 +342,17 @@ fn count_byte_class(cls: &hir::ClassBytes) -> u32 { cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum() } +/// Returns true if and only if any of the literals in the given set is +/// entirely whitespace. +fn has_only_whitespace(lits: &[Literal]) -> bool { + for lit in lits { + if lit.chars().all(|c| c.is_whitespace()) { + return true; + } + } + false +} + #[cfg(test)] mod tests { use super::LiteralSets;