diff --git a/CHANGELOG.md b/CHANGELOG.md index 5012596c1..c3be48888 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,11 +14,16 @@ CPUs (such as AVX2) for additional optimizations. per line. Previously, the behavior of ripgrep was to report the total number of matching lines. (Note that this behavior diverges from the behavior of GNU grep.) +* Octal syntax is no longer supported. ripgrep previously accepted expressions + like `\1` as syntax for matching `U+0001`, but ripgrep will now report an + error instead. Feature enhancements: * [FEATURE #411](https://github.com/BurntSushi/ripgrep/issues/411): Add a `--stats` flag, which emits aggregate statistics after search results. +* [FEATURE #702](https://github.com/BurntSushi/ripgrep/issues/702): + Support `\u{..}` Unicode escape sequences. * [FEATURE #812](https://github.com/BurntSushi/ripgrep/issues/812): Add `-b/--byte-offset` flag that reports byte offset of each matching line. * [FEATURE #814](https://github.com/BurntSushi/ripgrep/issues/814): @@ -29,12 +34,19 @@ Bug fixes: * [BUG #135](https://github.com/BurntSushi/ripgrep/issues/135): Release portable binaries that conditionally use SSSE3, AVX2, etc., at runtime. +* [BUG #268](https://github.com/BurntSushi/ripgrep/issues/268): + Print descriptive error message when trying to use look-around or + backreferences. +* [BUG #395](https://github.com/BurntSushi/ripgrep/issues/395): + Show comprehensible error messages for regexes like `\s*{`. * [BUG #526](https://github.com/BurntSushi/ripgrep/issues/526): Support backslash escapes in globs. * [BUG #832](https://github.com/BurntSushi/ripgrep/issues/832): Clarify usage instructions for `-f/--file` flag. * [BUG #852](https://github.com/BurntSushi/ripgrep/issues/852): Be robust with respect to `ENOMEM` errors returned by `mmap`. +* [BUG #853](https://github.com/BurntSushi/ripgrep/issues/853): + Upgrade `grep` crate to `regex-syntax 0.5.0`. 0.8.1 (2018-02-20) diff --git a/Cargo.lock b/Cargo.lock index 40c2e3a7e..4fe02d285 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,7 +109,7 @@ dependencies = [ "log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "regex 0.2.9 (registry+https://github.com/rust-lang/crates.io-index)", - "regex-syntax 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -212,19 +212,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "aho-corasick 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", - "regex-syntax 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", "thread_local 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "regex-syntax" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "regex-syntax" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "ucd-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -401,8 +396,7 @@ dependencies = [ "checksum redox_syscall 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = "0d92eecebad22b767915e4d529f89f28ee96dbbf5a4810d2b844373f136417fd" "checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" "checksum regex 0.2.9 (registry+https://github.com/rust-lang/crates.io-index)" = "bde64a9b799f85750f6469fd658cff5fce8d910a7d510858a1f9d15ca9f023bf" -"checksum regex-syntax 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "8e931c58b93d86f080c734bfd2bce7dd0079ae2331235818133c8be7f422e20e" -"checksum regex-syntax 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d8337992c33b62cf49462a1ce4d0eedbb021f48de017e40ec307cca445df0ca9" +"checksum regex-syntax 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "b2550876c31dc914696a6c2e01cbce8afba79a93c8ae979d2fe051c0230b3756" "checksum same-file 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "cfb6eded0b06a0b512c8ddbcf04089138c9b4362c2f696f3c3d76039d68f3637" "checksum simd 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3dd0805c7363ab51a829a1511ad24b6ed0349feaa756c4bc2f977f9f496e6673" "checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" diff --git a/grep/Cargo.toml b/grep/Cargo.toml index 541d3260e..b404a86a8 100644 --- a/grep/Cargo.toml +++ b/grep/Cargo.toml @@ -16,4 +16,4 @@ license = "Unlicense/MIT" log = "0.4" memchr = "2" regex = "0.2.9" -regex-syntax = "0.4.0" +regex-syntax = "0.5.3" diff --git a/grep/src/literals.rs b/grep/src/literals.rs index eebeac4c7..3e1c385bc 100644 --- a/grep/src/literals.rs +++ b/grep/src/literals.rs @@ -10,10 +10,8 @@ principled. use std::cmp; use regex::bytes::RegexBuilder; -use syntax::{ - Expr, Literals, Lit, - ByteClass, ByteRange, CharClass, ClassRange, Repeater, -}; +use syntax::hir::{self, Hir, HirKind}; +use syntax::hir::literal::{Literal, Literals}; #[derive(Clone, Debug)] pub struct LiteralSets { @@ -23,12 +21,12 @@ pub struct LiteralSets { } impl LiteralSets { - pub fn create(expr: &Expr) -> Self { + pub fn create(expr: &Hir) -> Self { let mut required = Literals::empty(); union_required(expr, &mut required); LiteralSets { - prefixes: expr.prefixes(), - suffixes: expr.suffixes(), + prefixes: Literals::prefixes(expr), + suffixes: Literals::suffixes(expr), required: required, } } @@ -93,60 +91,52 @@ impl LiteralSets { } } -fn union_required(expr: &Expr, lits: &mut Literals) { - use syntax::Expr::*; - match *expr { - Literal { ref chars, casei: false } => { - let s: String = chars.iter().cloned().collect(); - lits.cross_add(s.as_bytes()); +fn union_required(expr: &Hir, lits: &mut Literals) { + match *expr.kind() { + HirKind::Literal(hir::Literal::Unicode(c)) => { + let mut buf = [0u8; 4]; + lits.cross_add(c.encode_utf8(&mut buf).as_bytes()); } - Literal { ref chars, casei: true } => { - for &c in chars { - let cls = CharClass::new(vec![ - ClassRange { start: c, end: c }, - ]).case_fold(); - if !lits.add_char_class(&cls) { - lits.cut(); - return; - } + HirKind::Literal(hir::Literal::Byte(b)) => { + lits.cross_add(&[b]); + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) { + lits.cut(); + } + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) { + lits.cut(); } } - LiteralBytes { ref bytes, casei: false } => { - lits.cross_add(bytes); + HirKind::Group(hir::Group { ref hir, .. }) => { + union_required(&**hir, lits); } - LiteralBytes { ref bytes, casei: true } => { - for &b in bytes { - let cls = ByteClass::new(vec![ - ByteRange { start: b, end: b }, - ]).case_fold(); - if !lits.add_byte_class(&cls) { + HirKind::Repetition(ref x) => { + match x.kind { + hir::RepetitionKind::ZeroOrOne => lits.cut(), + hir::RepetitionKind::ZeroOrMore => lits.cut(), + hir::RepetitionKind::OneOrMore => { + union_required(&x.hir, lits); lits.cut(); - return; + } + hir::RepetitionKind::Range(ref rng) => { + let (min, max) = match *rng { + hir::RepetitionRange::Exactly(m) => (m, Some(m)), + hir::RepetitionRange::AtLeast(m) => (m, None), + hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), + }; + repeat_range_literals( + &x.hir, min, max, x.greedy, lits, union_required); } } } - Class(_) => { - lits.cut(); - } - ClassBytes(_) => { - lits.cut(); + HirKind::Concat(ref es) if es.is_empty() => {} + HirKind::Concat(ref es) if es.len() == 1 => { + union_required(&es[0], lits) } - Group { ref e, .. } => { - union_required(&**e, lits); - } - Repeat { r: Repeater::ZeroOrOne, .. } => lits.cut(), - Repeat { r: Repeater::ZeroOrMore, .. } => lits.cut(), - Repeat { ref e, r: Repeater::OneOrMore, .. } => { - union_required(&**e, lits); - lits.cut(); - } - Repeat { ref e, r: Repeater::Range { min, max }, greedy } => { - repeat_range_literals( - &**e, min, max, greedy, lits, union_required); - } - Concat(ref es) if es.is_empty() => {} - Concat(ref es) if es.len() == 1 => union_required(&es[0], lits), - Concat(ref es) => { + HirKind::Concat(ref es) => { for e in es { let mut lits2 = lits.to_empty(); union_required(e, &mut lits2); @@ -157,7 +147,6 @@ fn union_required(expr: &Expr, lits: &mut Literals) { if lits2.contains_empty() { lits.cut(); } - // if !lits.union(lits2) { if !lits.cross_product(&lits2) { // If this expression couldn't yield any literal that // could be extended, then we need to quit. Since we're @@ -167,15 +156,15 @@ fn union_required(expr: &Expr, lits: &mut Literals) { } } } - Alternate(ref es) => { + HirKind::Alternation(ref es) => { alternate_literals(es, lits, union_required); } _ => lits.cut(), } } -fn repeat_range_literals( - e: &Expr, +fn repeat_range_literals( + e: &Hir, min: u32, max: Option, _greedy: bool, @@ -204,8 +193,8 @@ fn repeat_range_literals( } } -fn alternate_literals( - es: &[Expr], +fn alternate_literals( + es: &[Hir], lits: &mut Literals, mut f: F, ) { @@ -234,11 +223,21 @@ fn alternate_literals( } lits.cut(); if !lcs.is_empty() { - lits.add(Lit::empty()); - lits.add(Lit::new(lcs.to_vec())); + lits.add(Literal::empty()); + lits.add(Literal::new(lcs.to_vec())); } } +/// Return the number of characters in the given class. +fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 { + cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum() +} + +/// Return the number of bytes in the given class. +fn count_byte_class(cls: &hir::ClassBytes) -> u32 { + cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum() +} + /// Converts an arbitrary sequence of bytes to a literal suitable for building /// a regular expression. fn bytes_to_regex(bs: &[u8]) -> String { diff --git a/grep/src/nonl.rs b/grep/src/nonl.rs index 361b0b003..3beb5f61c 100644 --- a/grep/src/nonl.rs +++ b/grep/src/nonl.rs @@ -1,4 +1,4 @@ -use syntax::Expr; +use syntax::hir::{self, Hir, HirKind}; use {Error, Result}; @@ -9,59 +9,66 @@ use {Error, Result}; /// /// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this /// function panics. -pub fn remove(expr: Expr, byte: u8) -> Result { - // TODO(burntsushi): There is a bug in this routine where only `\n` is - // handled correctly. Namely, `AnyChar` and `AnyByte` need to be translated - // to proper character classes instead of the special `AnyCharNoNL` and - // `AnyByteNoNL` classes. - use syntax::Expr::*; +pub fn remove(expr: Hir, byte: u8) -> Result { assert!(byte <= 0x7F); let chr = byte as char; assert!(chr.len_utf8() == 1); - Ok(match expr { - Literal { chars, casei } => { - if chars.iter().position(|&c| c == chr).is_some() { + Ok(match expr.into_kind() { + HirKind::Empty => Hir::empty(), + HirKind::Literal(hir::Literal::Unicode(c)) => { + if c == chr { return Err(Error::LiteralNotAllowed(chr)); } - Literal { chars: chars, casei: casei } + Hir::literal(hir::Literal::Unicode(c)) } - LiteralBytes { bytes, casei } => { - if bytes.iter().position(|&b| b == byte).is_some() { + HirKind::Literal(hir::Literal::Byte(b)) => { + if b as char == chr { return Err(Error::LiteralNotAllowed(chr)); } - LiteralBytes { bytes: bytes, casei: casei } + Hir::literal(hir::Literal::Byte(b)) } - AnyChar => AnyCharNoNL, - AnyByte => AnyByteNoNL, - Class(mut cls) => { - cls.remove(chr); - Class(cls) - } - ClassBytes(mut cls) => { - cls.remove(byte); - ClassBytes(cls) - } - Group { e, i, name } => { - Group { - e: Box::new(remove(*e, byte)?), - i: i, - name: name, + HirKind::Class(hir::Class::Unicode(mut cls)) => { + let remove = hir::ClassUnicode::new(Some( + hir::ClassUnicodeRange::new(chr, chr), + )); + cls.difference(&remove); + if cls.iter().next().is_none() { + return Err(Error::LiteralNotAllowed(chr)); } + Hir::class(hir::Class::Unicode(cls)) } - Repeat { e, r, greedy } => { - Repeat { - e: Box::new(remove(*e, byte)?), - r: r, - greedy: greedy, + HirKind::Class(hir::Class::Bytes(mut cls)) => { + let remove = hir::ClassBytes::new(Some( + hir::ClassBytesRange::new(byte, byte), + )); + cls.difference(&remove); + if cls.iter().next().is_none() { + return Err(Error::LiteralNotAllowed(chr)); } + Hir::class(hir::Class::Bytes(cls)) + } + HirKind::Anchor(x) => Hir::anchor(x), + HirKind::WordBoundary(x) => Hir::word_boundary(x), + HirKind::Repetition(mut x) => { + x.hir = Box::new(remove(*x.hir, byte)?); + Hir::repetition(x) + } + HirKind::Group(mut x) => { + x.hir = Box::new(remove(*x.hir, byte)?); + Hir::group(x) } - Concat(exprs) => { - Concat(exprs.into_iter().map(|e| remove(e, byte)).collect::>>()?) + HirKind::Concat(xs) => { + let xs = xs.into_iter() + .map(|e| remove(e, byte)) + .collect::>>()?; + Hir::concat(xs) } - Alternate(exprs) => { - Alternate(exprs.into_iter().map(|e| remove(e, byte)).collect::>>()?) + HirKind::Alternation(xs) => { + let xs = xs.into_iter() + .map(|e| remove(e, byte)) + .collect::>>()?; + Hir::alternation(xs) } - e => e, }) } diff --git a/grep/src/search.rs b/grep/src/search.rs index 8d056796a..1d5d7e29c 100644 --- a/grep/src/search.rs +++ b/grep/src/search.rs @@ -1,10 +1,10 @@ use memchr::{memchr, memrchr}; use regex::bytes::{Regex, RegexBuilder}; -use syntax; use literals::LiteralSets; use nonl; -use syntax::Expr; +use syntax::ParserBuilder; +use syntax::hir::Hir; use word_boundary::strip_unicode_word_boundaries; use Result; @@ -166,7 +166,7 @@ impl GrepBuilder { /// Creates a new regex from the given expression with the current /// configuration. - fn regex(&self, expr: &Expr) -> Result { + fn regex(&self, expr: &Hir) -> Result { let mut builder = RegexBuilder::new(&expr.to_string()); builder.unicode(true); self.regex_build(builder) @@ -184,15 +184,16 @@ impl GrepBuilder { /// Parses the underlying pattern and ensures the pattern can never match /// the line terminator. - fn parse(&self) -> Result { - let expr = - syntax::ExprBuilder::new() - .allow_bytes(true) - .unicode(true) + fn parse(&self) -> Result { + let expr = ParserBuilder::new() + .allow_invalid_utf8(true) .case_insensitive(self.is_case_insensitive()?) + .multi_line(true) + .build() .parse(&self.pattern)?; + debug!("original regex HIR pattern:\n{}", expr); let expr = nonl::remove(expr, self.opts.line_terminator)?; - debug!("regex ast:\n{:#?}", expr); + debug!("transformed regex HIR pattern:\n{}", expr); Ok(expr) } diff --git a/grep/src/word_boundary.rs b/grep/src/word_boundary.rs index 6df5c6574..8e6b86d12 100644 --- a/grep/src/word_boundary.rs +++ b/grep/src/word_boundary.rs @@ -1,4 +1,4 @@ -use syntax::Expr; +use syntax::hir::{self, Hir, HirKind}; /// Strips Unicode word boundaries from the given expression. /// @@ -8,7 +8,7 @@ use syntax::Expr; /// false negatives. /// /// If no word boundaries could be stripped, then None is returned. -pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option { +pub fn strip_unicode_word_boundaries(expr: &Hir) -> Option { // The real reason we do this is because Unicode word boundaries are the // one thing that Rust's regex DFA engine can't handle. When it sees a // Unicode word boundary among non-ASCII text, it falls back to one of the @@ -16,23 +16,24 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option { // a regex to find candidate matches without a Unicode word boundary. We'll // only then use the full (and slower) regex to confirm a candidate as a // match or not during search. - use syntax::Expr::*; - - match *expr { - Concat(ref es) if !es.is_empty() => { + // + // It looks like we only check the outer edges for `\b`? I guess this is + // an attempt to optimize for the `-w/--word-regexp` flag? ---AG + match *expr.kind() { + HirKind::Concat(ref es) if !es.is_empty() => { let first = is_unicode_word_boundary(&es[0]); let last = is_unicode_word_boundary(es.last().unwrap()); // Be careful not to strip word boundaries if there are no other // expressions to match. match (first, last) { (true, false) if es.len() > 1 => { - Some(Concat(es[1..].to_vec())) + Some(Hir::concat(es[1..].to_vec())) } (false, true) if es.len() > 1 => { - Some(Concat(es[..es.len() - 1].to_vec())) + Some(Hir::concat(es[..es.len() - 1].to_vec())) } (true, true) if es.len() > 2 => { - Some(Concat(es[1..es.len() - 1].to_vec())) + Some(Hir::concat(es[1..es.len() - 1].to_vec())) } _ => None, } @@ -42,13 +43,11 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option { } /// Returns true if the given expression is a Unicode word boundary. -fn is_unicode_word_boundary(expr: &Expr) -> bool { - use syntax::Expr::*; - - match *expr { - WordBoundary => true, - NotWordBoundary => true, - Group { ref e, .. } => is_unicode_word_boundary(e), +fn is_unicode_word_boundary(expr: &Hir) -> bool { + match *expr.kind() { + HirKind::WordBoundary(hir::WordBoundary::Unicode) => true, + HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => true, + HirKind::Group(ref x) => is_unicode_word_boundary(&x.hir), _ => false, } } diff --git a/src/args.rs b/src/args.rs index c7b31dd36..2bbabf1a3 100644 --- a/src/args.rs +++ b/src/args.rs @@ -9,7 +9,7 @@ use std::sync::atomic::{AtomicBool, Ordering}; use clap; use encoding_rs::Encoding; -use grep::{Grep, GrepBuilder, Error as GrepError}; +use grep::{Grep, GrepBuilder}; use log; use num_cpus; use regex; @@ -882,16 +882,7 @@ impl<'a> ArgMatches<'a> { if let Some(limit) = self.regex_size_limit()? { gb = gb.size_limit(limit); } - gb.build().map_err(|err| { - match err { - GrepError::Regex(err) => { - let s = format!("{}\n(Hint: Try the --fixed-strings flag \ - to search for a literal string.)", err.to_string()); - From::from(s) - }, - err => From::from(err) - } - }) + Ok(gb.build()?) } /// Builds the set of glob overrides from the command line flags. diff --git a/tests/tests.rs b/tests/tests.rs index 8cb871843..855f2484d 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -1687,16 +1687,6 @@ sherlock!(feature_419_zero_as_shortcut_for_null, "Sherlock", ".", assert_eq!(lines, "sherlock\x002\n"); }); -// See: https://github.com/BurntSushi/ripgrep/issues/709 -clean!(suggest_fixed_strings_for_invalid_regex, "foo(", ".", -|wd: WorkDir, mut cmd: Command| { - wd.assert_non_empty_stderr(&mut cmd); - - let output = cmd.output().unwrap(); - let err = String::from_utf8_lossy(&output.stderr); - assert_eq!(err.contains("--fixed-strings"), true); -}); - #[test] fn compressed_gzip() { if !cmd_exists("gzip") {