Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(regular_expression)!: Support ES2025 Duplicated named capture groups #6847

Merged
merged 1 commit into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions crates/oxc_linter/src/snapshots/no_invalid_regexp.snap
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ source: crates/oxc_linter/src/tester.rs
╰────

⚠ eslint(no-invalid-regexp): Invalid regular expression: Duplicated capturing group names
╭─[no_invalid_regexp.tsx:1:23]
╭─[no_invalid_regexp.tsx:1:16]
1 │ new RegExp('(?<k>a)(?<k>b)')
·
·
╰────
5 changes: 5 additions & 0 deletions crates/oxc_regular_expression/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,8 @@ Implements ECMAScript® 2024 Language Specification
- https://tc39.es/ecma262/2024/multipage/ecmascript-language-lexical-grammar.html#sec-literals-regular-expression-literals
- https://tc39.es/ecma262/2024/multipage/text-processing.html#sec-regexp-regular-expression-objects
- https://tc39.es/ecma262/2024/multipage/additional-ecmascript-features-for-web-browsers.html#sec-regular-expressions-patterns

And, Stage 3 proposals

- https://github.com/tc39/proposal-duplicate-named-capturing-groups
- https://github.com/tc39/proposal-regexp-modifiers
32 changes: 25 additions & 7 deletions crates/oxc_regular_expression/src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,17 @@ mod test {
(r"\1()", "u"),
(r"(?<n1>..)(?<n2>..)", ""),
// ES2025 ---
// TODO: Duplicate named capturing groups
// (r"(?<n1>..)|(?<n1>..)", ""),
// (r"(?<year>[0-9]{4})-[0-9]{2}|[0-9]{2}-(?<year>[0-9]{4})", ""),
// (r"(?:(?<a>x)|(?<a>y))\k<a>", ""),
// Duplicate named capturing groups
(r"(?<n1>..)|(?<n1>..)", ""),
(r"(?<year>[0-9]{4})-[0-9]{2}|[0-9]{2}-(?<year>[0-9]{4})", ""),
(r"(?:(?<a>x)|(?<a>y))\k<a>", ""),
(r"(?<x>a)|(?<x>b)", ""),
(r"(?:(?<x>a)|(?<y>a)(?<x>b))(?:(?<z>c)|(?<z>d))", ""),
(r"(?:(?<x>a)|(?<x>b))\\k<x>", ""),
(r"(?:(?:(?<x>a)|(?<x>b)|c)\\k<x>){2}", ""),
(r"(?:(?:(?<x>a)|(?<x>b))\\k<x>){2}", ""),
(r"(?:(?:(?<x>a)\\k<x>|(?<x>b)\\k<x>)|(?:))\\k<x>", ""),
(r"(?:(?:(?<x>a\\k<x>)|(?<x>b\\k<x>))|(?:))\\k<x>", ""),
// Modifiers
(r"(?:.)", ""),
(r"(?s:.)", ""),
Expand Down Expand Up @@ -190,9 +197,15 @@ mod test {
(r"[\q{]", "v"),
(r"[\q{\a}]", "v"),
// ES2025 ---
// TODO: Duplicate named capturing groups
(r"(?<n>..)|(?<n>..)", ""), // This will be valid
// (r"(?<a>|(?<a>))", ""), // Nested, still invalid
// Duplicate named capturing groups
(r"(?<n>.)(?<n>.)", ""),
(r"(?<n>.(?<n>..))", "u"),
("(?<n>)|(?<n>)(?<n>)", ""),
("(((((((?<n>.)))))))(?<n>)", ""),
("(?:(?<x>a)|(?<x>b))(?<x>c)", ""),
("(?<x>a)(?:(?<x>b)|(?<x>c))", ""),
("(?:(?:(?<x>a)|(?<x>b)))(?<x>c)", ""),
("(?:(?:(?<x>a)|(?<x>b))|(?:))(?<x>c)", ""),
// Modifiers
(r"(?a:.)", ""),
(r"(?-S:.)", ""),
Expand Down Expand Up @@ -255,6 +268,11 @@ mod test {
(r"[[[[[^[[[[\q{ng}]]]]]]]]]", "v", true),
(r"[^[[[[[[[[[[[[[[[[\q{ng}]]]]]]]]]]]]]]]]]", "v", true),
// ES2025 ---
// Duplicated named capture groups
("(?:(?<x>a)|(?<x>b))(?<x>c)", "", true),
("(?:(?<x>a)|(?<x>b))(?<X>c)", "", false),
("(?<x>a)(?:(?<x>b)|(?<x>c))", "", true),
("(?<x>a)|(?:(?<x>b)|(?<x>c))", "", false),
// Modifiers
(r"(?ii:.)", "", true),
(r"(?-ss:.)", "", true),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,14 @@ impl<'a> PatternParser<'a> {
// - Cons: 1st pass is completely useless if the pattern does not contain any capturing groups
// We may re-consider this if we need more performance rather than simplicity.
let checkpoint = self.reader.checkpoint();
let duplicated_named_capturing_groups =
self.state.initialize_with_parsing(&mut self.reader);

// [SS:EE] Pattern :: Disjunction
// It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same.
self.state.initialize_with_parsing(&mut self.reader).map_err(|offsets| {
diagnostics::duplicated_capturing_group_names(
offsets.iter().map(|&(start, end)| self.span_factory.create(start, end)).collect(),
)
})?;
self.reader.rewind(checkpoint);

// [SS:EE] Pattern :: Disjunction
Expand All @@ -58,16 +64,6 @@ impl<'a> PatternParser<'a> {
if u32::MAX == self.state.num_of_capturing_groups {
return Err(diagnostics::too_may_capturing_groups(self.span_factory.create(0, 0)));
}
// [SS:EE] Pattern :: Disjunction
// It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same.
if !duplicated_named_capturing_groups.is_empty() {
return Err(diagnostics::duplicated_capturing_group_names(
duplicated_named_capturing_groups
.iter()
.map(|&(start, end)| self.span_factory.create(start, end))
.collect(),
));
}

// Let's start parsing!
let span_start = self.reader.offset();
Expand Down
184 changes: 138 additions & 46 deletions crates/oxc_regular_expression/src/parser/pattern_parser/state.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use oxc_span::Atom;
use rustc_hash::FxHashSet;
use rustc_hash::{FxHashMap, FxHashSet};

use crate::parser::reader::Reader;

Expand All @@ -16,6 +16,8 @@ pub struct State<'a> {
pub capturing_group_names: FxHashSet<Atom<'a>>,
}

type DuplicatedNamedCapturingGroupOffsets = Vec<(u32, u32)>;

impl<'a> State<'a> {
pub fn new(unicode_mode: bool, unicode_sets_mode: bool) -> Self {
Self {
Expand All @@ -27,12 +29,11 @@ impl<'a> State<'a> {
}
}

pub fn initialize_with_parsing(&mut self, reader: &mut Reader<'a>) -> Vec<(u32, u32)> {
let (
num_of_left_capturing_parens,
capturing_group_names,
duplicated_named_capturing_groups,
) = parse_capturing_groups(reader);
pub fn initialize_with_parsing(
&mut self,
reader: &mut Reader<'a>,
) -> Result<(), DuplicatedNamedCapturingGroupOffsets> {
let (num_of_left_capturing_parens, capturing_group_names) = parse_capturing_groups(reader)?;

// In Annex B, this is `false` by default.
// It is `true`
Expand All @@ -44,26 +45,43 @@ impl<'a> State<'a> {
self.num_of_capturing_groups = num_of_left_capturing_parens;
self.capturing_group_names = capturing_group_names;

duplicated_named_capturing_groups
Ok(())
}
}

/// Returns: (num_of_left_parens, capturing_group_names, duplicated_named_capturing_groups)
enum SimpleUnit<'a> {
Open,
Close,
Pipe,
GroupName(Atom<'a>),
}

/// Returns: Result<(num_of_left_parens, capturing_group_names), duplicated_named_capturing_group_offsets>
fn parse_capturing_groups<'a>(
reader: &mut Reader<'a>,
) -> (u32, FxHashSet<Atom<'a>>, Vec<(u32, u32)>) {
let mut num_of_left_capturing_parens = 0;
let mut capturing_group_names = FxHashSet::default();
let mut duplicated_named_capturing_groups = vec![];

let mut in_escape = false;
let mut in_character_class = false;

) -> Result<(u32, FxHashSet<Atom<'a>>), DuplicatedNamedCapturingGroupOffsets> {
// Count only normal CapturingGroup(named, unnamed)
// (?<name>...), (...)
// IgnoreGroup, and LookaroundAssertions are ignored
// (?:...)
// (?=...), (?!...), (?<=...), (?<!...)
let mut num_of_left_capturing_parens = 0;

// Collect capturing group names
let mut group_names: FxHashMap<Atom<'a>, (u32, u32)> = FxHashMap::default();
// At the same time, check duplicates
// If you want to process this most efficiently:
// - define a scope for each Disjunction
// - then check for duplicates for each `|` while inheriting the parent-child relationship
// ref. https://source.chromium.org/chromium/chromium/src/+/main:v8/src/regexp/regexp-parser.cc;l=1644
// However, duplicates are rare in the first place.
// And as long as it works simply, this may be enough.
let mut may_duplicates: FxHashMap<Atom<'a>, DuplicatedNamedCapturingGroupOffsets> =
FxHashMap::default();
let mut simplified: Vec<SimpleUnit<'a>> = vec![];

let mut in_escape = false;
let mut in_character_class = false;
while let Some(cp) = reader.peek() {
if in_escape {
in_escape = false;
Expand All @@ -73,9 +91,15 @@ fn parse_capturing_groups<'a>(
in_character_class = true;
} else if cp == ']' as u32 {
in_character_class = false;
} else if !in_character_class && cp == '|' as u32 {
simplified.push(SimpleUnit::Pipe);
} else if !in_character_class && cp == ')' as u32 {
simplified.push(SimpleUnit::Close);
} else if !in_character_class && cp == '(' as u32 {
reader.advance();

simplified.push(SimpleUnit::Open);

// Skip IgnoreGroup
if reader.eat2('?', ':')
// Skip LookAroundAssertion
Expand All @@ -102,56 +126,124 @@ fn parse_capturing_groups<'a>(
let span_end = reader.offset();

if reader.eat('>') {
// May be duplicated
if !capturing_group_names.insert(reader.atom(span_start, span_end)) {
// Report them with `Span`
duplicated_named_capturing_groups.push((span_start, span_end));
let group_name = reader.atom(span_start, span_end);

simplified.push(SimpleUnit::GroupName(group_name.clone()));
// Check duplicates later
if let Some(last_span) = group_names.get(&group_name) {
let entry = may_duplicates.entry(group_name).or_default();
entry.push(*last_span);
entry.push((span_start, span_end));
} else {
group_names.insert(group_name, (span_start, span_end));
}

continue;
}
}

// Unnamed
continue;
}

reader.advance();
}

(num_of_left_capturing_parens, capturing_group_names, duplicated_named_capturing_groups)
// Check duplicates and emit error if exists
if !may_duplicates.is_empty() {
// Check must be done for each group name
for (group_name, spans) in may_duplicates {
let iter = simplified.iter().clone();

let mut alternative_depth = FxHashSet::default();
let mut depth = 0_u32;
let mut is_first = true;

'outer: for token in iter {
match token {
SimpleUnit::Open => {
depth += 1;
}
SimpleUnit::Close => {
// May panic if the pattern has invalid, unbalanced parens
depth = depth.saturating_sub(1);
}
SimpleUnit::Pipe => {
if !is_first {
alternative_depth.insert(depth);
}
}
SimpleUnit::GroupName(name) => {
// Check target group name only
if *name != group_name {
continue;
}
// Skip the first one, because it is not duplicated
if is_first {
is_first = false;
continue;
}

// If left outer `|` is found, both can participate
// `|(?<n>)`
// ^ ^ depth: 1
// ^ depth: 0
for i in (0..depth).rev() {
if alternative_depth.contains(&i) {
// Remove it, next duplicates requires another `|`
alternative_depth.remove(&i);
continue 'outer;
}
}

return Err(spans);
}
}
}
}
}

Ok((num_of_left_capturing_parens, group_names.keys().cloned().collect()))
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_count_capturing_groups() {
fn count_capturing_groups() {
for (source_text, expected) in [
("()", (1, 0, false)),
(r"\1()", (1, 0, false)),
("(foo)", (1, 0, false)),
("(foo)(bar)", (2, 0, false)),
("(foo(bar))", (2, 0, false)),
("(foo)[(bar)]", (1, 0, false)),
(r"(foo)\(bar\)", (1, 0, false)),
("(foo)(?<n>bar)", (2, 1, false)),
("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", (1, 0, false)),
("(foo)(?<n>bar)(?<nn>baz)", (3, 2, false)),
("(?<n>.)(?<n>..)", (2, 1, true)),
("(?<n>.(?<n>..))", (2, 1, true)),
("()", (1, 0)),
(r"\1()", (1, 0)),
("(foo)", (1, 0)),
("(foo)(bar)", (2, 0)),
("(foo(bar))", (2, 0)),
("(foo)[(bar)]", (1, 0)),
(r"(foo)\(bar\)", (1, 0)),
("(foo)(?<n>bar)", (2, 1)),
("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", (1, 0)),
("(foo)(?<n>bar)(?<nn>baz)", (3, 2)),
("(?<n>.)(?<m>.)|(?<n>..)|(?<m>.)", (4, 2)),
("(?<n>.)(?<m>.)|(?:..)|(?<m>.)", (3, 2)),
] {
let mut reader = Reader::initialize(source_text, true, false).unwrap();

let (
num_of_left_capturing_parens,
capturing_group_names,
duplicated_named_capturing_groups,
) = parse_capturing_groups(&mut reader);

let actual = (
num_of_left_capturing_parens,
capturing_group_names.len(),
!duplicated_named_capturing_groups.is_empty(),
);
let (num_of_left_capturing_parens, capturing_group_names) =
parse_capturing_groups(&mut reader).unwrap();

let actual = (num_of_left_capturing_parens, capturing_group_names.len());
assert_eq!(expected, actual, "{source_text}");
}
}

#[test]
fn duplicated_named_capturing_groups() {
for source_text in
["(?<n>.)(?<n>..)", "(?<n>.(?<n>..))", "|(?<n>.(?<n>..))", "(?<m>.)|(?<n>.(?<n>..))"]
{
let mut reader = Reader::initialize(source_text, true, false).unwrap();

assert!(parse_capturing_groups(&mut reader).is_err(), "{source_text}");
}
}
}
4 changes: 2 additions & 2 deletions tasks/coverage/snapshots/codegen_test262.snap
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
commit: 06454619

codegen_test262 Summary:
AST Parsed : 43832/43832 (100.00%)
Positive Passed: 43832/43832 (100.00%)
AST Parsed : 43851/43851 (100.00%)
Positive Passed: 43851/43851 (100.00%)
4 changes: 2 additions & 2 deletions tasks/coverage/snapshots/minifier_test262.snap
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
commit: 06454619

minifier_test262 Summary:
AST Parsed : 43832/43832 (100.00%)
Positive Passed: 43832/43832 (100.00%)
AST Parsed : 43851/43851 (100.00%)
Positive Passed: 43851/43851 (100.00%)
Loading
Loading