diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 1096e9f05..a4b7bc4ac 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -243,6 +243,7 @@ impl Hir { info.set_match_empty(true); info.set_literal(false); info.set_alternation_literal(false); + info.static_capture_count = Some(0); Hir { kind: HirKind::Empty, info } } @@ -268,6 +269,7 @@ impl Hir { info.set_match_empty(false); info.set_literal(true); info.set_alternation_literal(true); + info.static_capture_count = Some(0); Hir { kind: HirKind::Literal(lit), info } } @@ -285,6 +287,7 @@ impl Hir { info.set_match_empty(false); info.set_literal(false); info.set_alternation_literal(false); + info.static_capture_count = Some(0); Hir { kind: HirKind::Class(class), info } } @@ -318,6 +321,7 @@ impl Hir { if let Anchor::EndLine = anchor { info.set_line_anchored_end(true); } + info.static_capture_count = Some(0); Hir { kind: HirKind::Anchor(anchor), info } } @@ -345,6 +349,7 @@ impl Hir { if let WordBoundary::AsciiNegate = word_boundary { info.set_always_utf8(false); } + info.static_capture_count = Some(0); Hir { kind: HirKind::WordBoundary(word_boundary), info } } @@ -372,6 +377,28 @@ impl Hir { info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty()); info.set_literal(false); info.set_alternation_literal(false); + info.static_capture_count = { + use RepetitionRange::{AtLeast, Bounded, Exactly}; + match (&rep.kind, rep.hir.info.static_capture_count) { + // Always zero. + (_, Some(0)) => Some(0), + (RepetitionKind::Range(Exactly(0)), _) => Some(0), + (RepetitionKind::Range(Bounded(_, 0)), _) => Some(0), + + // Impossible to know statically. + (_, None) => None, + (RepetitionKind::ZeroOrOne, _) => None, + (RepetitionKind::ZeroOrMore, _) => None, + (RepetitionKind::Range(AtLeast(0)), _) => None, + (RepetitionKind::Range(Bounded(0, _)), _) => None, + + // Guaranteed to be static. + (RepetitionKind::OneOrMore, Some(n)) => Some(n), + (RepetitionKind::Range(Exactly(_)), Some(n)) => Some(n), + (RepetitionKind::Range(AtLeast(_)), Some(n)) => Some(n), + (RepetitionKind::Range(Bounded(_, _)), Some(n)) => Some(n), + } + }; Hir { kind: HirKind::Repetition(rep), info } } @@ -389,6 +416,12 @@ impl Hir { info.set_match_empty(group.hir.is_match_empty()); info.set_literal(false); info.set_alternation_literal(false); + info.static_capture_count = match group.kind { + GroupKind::NonCapturing { .. } => { + group.hir.info.static_capture_count + } + _ => group.hir.info.static_capture_count.map(|n| n + 1), + }; Hir { kind: HirKind::Group(group), info } } @@ -480,6 +513,10 @@ impl Hir { }) .any(|e| e.is_line_anchored_end()), ); + info.static_capture_count = + exprs.iter().fold(Some(0), |cnt, e| { + Some(cnt? + e.info.static_capture_count?) + }); Hir { kind: HirKind::Concat(exprs), info } } } @@ -542,6 +579,11 @@ impl Hir { let x = info.is_alternation_literal() && e.is_literal(); info.set_alternation_literal(x); } + let mut capture_counts = + exprs.iter().map(|e| e.info.static_capture_count); + let first = capture_counts.next().unwrap_or(Some(0)); + info.static_capture_count = capture_counts + .fold(first, |a, b| if a == b { a } else { None }); Hir { kind: HirKind::Alternation(exprs), info } } } @@ -692,6 +734,13 @@ impl Hir { pub fn is_alternation_literal(&self) -> bool { self.info.is_alternation_literal() } + + /// Returns the number of captures groups that would participate in a + /// successful match of this expression. If this number can not be + /// statically determined from the regex this function returns `None`. + pub fn participating_captures_len(&self) -> Option { + self.info.static_capture_count.map(|c| c as usize) + } } impl HirKind { @@ -1484,6 +1533,11 @@ struct HirInfo { /// If more attributes need to be added, it is OK to increase the size of /// this as appropriate. bools: u16, + + /// How many capture groups this HIR expression deterministically fills. + /// If this number could depend on the input (e.g. an Alternation where the + /// two branches have a different number of capture groups), this is None. + static_capture_count: Option, } // A simple macro for defining bitfield accessors/mutators. @@ -1505,7 +1559,7 @@ macro_rules! define_bool { impl HirInfo { fn new() -> HirInfo { - HirInfo { bools: 0 } + HirInfo { bools: 0, static_capture_count: None } } define_bool!(0, is_always_utf8, set_always_utf8); diff --git a/src/compile.rs b/src/compile.rs index 90ca25015..c7bc66c0b 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -149,6 +149,8 @@ impl Compiler { self.compiled.start = dotstar_patch.entry; } self.compiled.captures = vec![None]; + self.compiled.participating_captures_len = + expr.participating_captures_len(); let patch = self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); if self.compiled.needs_dotstar() { diff --git a/src/exec.rs b/src/exec.rs index e75ca083a..51cbd05a3 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -70,7 +70,7 @@ struct ExecReadOnly { /// A compiled program that is used in the NFA simulation and backtracking. /// It can be byte-based or Unicode codepoint based. /// - /// N.B. It is not possibly to make this byte-based from the public API. + /// N.B. It is not possible to make this byte-based from the public API. /// It is only used for testing byte based programs in the NFA simulations. nfa: Program, /// A compiled byte based program for DFA execution. This is only used @@ -1311,6 +1311,13 @@ impl Exec { pub fn capture_name_idx(&self) -> &Arc> { &self.ro.nfa.capture_name_idx } + + /// Returns the number of participating captures that this regex will + /// return on a successful match. If this number can not be statically + /// determined from the regex this function returns `None`. + pub fn participating_captures_len(&self) -> Option { + self.ro.nfa.participating_captures_len + } } impl Clone for Exec { diff --git a/src/prog.rs b/src/prog.rs index c211f71d8..c29bd8458 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -12,7 +12,7 @@ use crate::literal::LiteralSearcher; /// `InstPtr` represents the index of an instruction in a regex program. pub type InstPtr = usize; -/// Program is a sequence of instructions and various facts about thos +/// Program is a sequence of instructions and various facts about those /// instructions. #[derive(Clone)] pub struct Program { @@ -25,6 +25,9 @@ pub struct Program { /// The ordered sequence of all capture groups extracted from the AST. /// Unnamed groups are `None`. pub captures: Vec>, + /// The number of capture groups that participate in a successful match. + /// None if this can't be determined statically at compile time. + pub participating_captures_len: Option, /// Pointers to all named capture groups into `captures`. pub capture_name_idx: Arc>, /// A pointer to the start instruction. This can vary depending on how @@ -82,6 +85,7 @@ impl Program { insts: vec![], matches: vec![], captures: vec![], + participating_captures_len: None, capture_name_idx: Arc::new(HashMap::new()), start: 0, byte_classes: vec![0; 256], diff --git a/src/re_bytes.rs b/src/re_bytes.rs index d71969257..cd1594773 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -667,6 +667,13 @@ impl Regex { self.0.capture_names().len() } + /// Returns the number of participating captures that this regex will + /// return on a successful match. If this number can not be statically + /// determined from the regex this function returns `None`. + pub fn participating_captures_len(&self) -> Option { + self.0.participating_captures_len() + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations { diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 60d81a7d9..29ac6b218 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -725,6 +725,13 @@ impl Regex { self.0.capture_names().len() } + /// Returns the number of participating captures that this regex will + /// return on a successful match. If this number can not be statically + /// determined from the regex this function returns `None`. + pub fn participating_captures_len(&self) -> Option { + self.0.participating_captures_len() + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations { diff --git a/tests/api.rs b/tests/api.rs index c7250a8a3..4dc20c4b9 100644 --- a/tests/api.rs +++ b/tests/api.rs @@ -128,6 +128,47 @@ fn capture_index_lifetime() { assert_eq!(3, inner("123")); } +#[test] +fn participating_captures_len() { + let tests = [ + ("", Some(0)), + ("foo|bar", Some(0)), + ("(foo)|bar", None), + ("foo|(bar)", None), + ("(foo|bar)", Some(1)), + ("(a|b|c|d|e|f)", Some(1)), + ("(a)|(b)|(c)|(d)|(e)|(f)", Some(1)), + ("(a)(b)|(c)(d)|(e)(f)", Some(2)), + ("(a)(b)(c)|(d)(e)(f)", Some(3)), + ("(a)(b)(c)(d)(e)(f)", Some(6)), + ("(a)(b)(extra)|(a)(b)()", Some(3)), + ("(a)(b)((?:extra)?)", Some(3)), + ("(a)(b)(extra)?", None), + ("(foo)|(bar)", Some(1)), + ("(foo)(bar)", Some(2)), + ("(foo)+(bar)", Some(2)), + ("(foo)*(bar)", None), + ("(foo)?{0}", Some(0)), + ("(foo)?{1}", None), + ("(foo){1}", Some(1)), + ("(foo){1,}", Some(1)), + ("(foo){1,}?", Some(1)), + ("(foo){0,}", None), + ("(foo)(?:bar)", Some(1)), + ("(foo(?:bar)+)(?:baz(boo))", Some(2)), + ("(?Pfoo)(?:bar)(bal|loon)", Some(2)), + (r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#, Some(2)), + ]; + for (test_regex, expected) in tests { + let re = regex!(test_regex); + assert_eq!( + re.participating_captures_len(), + expected, + "for regex {test_regex}" + ); + } +} + #[test] fn capture_misc() { let re = regex!(r"(.)(?Pa)?(.)(?P.)");