From 5a73a663dc2b5f22029a0c2737315ea6ef8be345 Mon Sep 17 00:00:00 2001 From: leaysgur <6259812+leaysgur@users.noreply.github.com> Date: Thu, 3 Oct 2024 02:47:08 +0000 Subject: [PATCH] refactor(regular_expression)!: Simplify public APIs (#6262) This PR makes 2 changes to simplify parts of the existing API that are not very useful. - Remove `(Literal)Parser` and `FlagsParser` and their ASTs - Add `with_flags(flags_text)` helper to `ParserOptions` Here are the details. > Remove `(Literal)Parser` and `FlagsParser` and their ASTs Previously, the `oxc_regular_expression` crate exposed 3 parsers. - `(Literal)Parser`: assumes `/pattern/flags` format - `PatternParser`: assumes `pattern` part only - `FlagsParser`: assumes `flags` part only However, it turns out that in actual use cases, the `PatternParser` alone is sufficient, as the pattern and flags are validated and sliced in advance on the `oxc_parser` side. The current use case for `(Literal)Parser` is mostly internal testing. There were also some misuses of `(Literal)Parser` that rebuilt the literal with `format!("/{pattern}/{flags}")` only to pass it back to `(Literal)Parser`. Therefore, only `PatternParser` is now published, and unnecessary ASTs have been removed. (This also obsoletes #5592.) > Add `with_flags(flags_text)` helper to `ParserOptions` Strictly speaking, there was a subtle difference between the "flag" strings that users were aware of and the "mode" recognised by the parser. Therefore, it was a common mistake to forget to enable `unicode_mode` when using the `v` flag. With this helper, crate users no longer need to distinguish between flags and modes. 
--- .../oxc_ast/src/generated/assert_layouts.rs | 36 --- .../src/rules/eslint/no_control_regex.rs | 49 ++- .../src/rules/eslint/no_invalid_regexp.rs | 105 +++---- .../src/rules/eslint/no_regex_spaces.rs | 12 +- .../src/snapshots/no_invalid_regexp.snap | 26 +- .../oxc_parser/examples/regular_expression.rs | 21 +- crates/oxc_parser/src/js/expression.rs | 18 +- .../examples/parse_literal.rs | 91 +++--- .../examples/regex_visitor.rs | 6 +- crates/oxc_regular_expression/src/ast.rs | 26 -- .../src/ast_impl/display.rs | 64 ++-- .../src/body_parser/mod.rs | 278 ------------------ .../oxc_regular_expression/src/diagnostics.rs | 37 --- .../src/flags_parser.rs | 68 ----- .../src/generated/derive_clone_in.rs | 28 -- .../src/generated/derive_content_eq.rs | 20 -- .../src/generated/derive_content_hash.rs | 20 -- crates/oxc_regular_expression/src/lib.rs | 10 +- .../src/literal_parser.rs | 165 ----------- crates/oxc_regular_expression/src/options.rs | 21 +- .../oxc_regular_expression/src/parser/mod.rs | 266 +++++++++++++++++ .../parser.rs => parser/parser_impl.rs} | 16 +- .../src/{body_parser => parser}/reader.rs | 0 .../src/{ => parser}/span_factory.rs | 0 .../src/{body_parser => parser}/state.rs | 2 +- .../src/{body_parser => parser}/unicode.rs | 0 .../unicode_property.rs | 0 crates/oxc_transformer/src/regexp/mod.rs | 14 +- tasks/coverage/src/driver.rs | 17 +- 29 files changed, 476 insertions(+), 940 deletions(-) delete mode 100644 crates/oxc_regular_expression/src/body_parser/mod.rs delete mode 100644 crates/oxc_regular_expression/src/flags_parser.rs delete mode 100644 crates/oxc_regular_expression/src/literal_parser.rs create mode 100644 crates/oxc_regular_expression/src/parser/mod.rs rename crates/oxc_regular_expression/src/{body_parser/parser.rs => parser/parser_impl.rs} (99%) rename crates/oxc_regular_expression/src/{body_parser => parser}/reader.rs (100%) rename crates/oxc_regular_expression/src/{ => parser}/span_factory.rs (100%) rename 
crates/oxc_regular_expression/src/{body_parser => parser}/state.rs (99%) rename crates/oxc_regular_expression/src/{body_parser => parser}/unicode.rs (100%) rename crates/oxc_regular_expression/src/{body_parser => parser}/unicode_property.rs (100%) diff --git a/crates/oxc_ast/src/generated/assert_layouts.rs b/crates/oxc_ast/src/generated/assert_layouts.rs index 340635355be01..f5a4a6d88b0ec 100644 --- a/crates/oxc_ast/src/generated/assert_layouts.rs +++ b/crates/oxc_ast/src/generated/assert_layouts.rs @@ -1412,24 +1412,6 @@ const _: () = { assert!(size_of::() == 1usize); assert!(align_of::() == 1usize); - assert!(size_of::() == 72usize); - assert!(align_of::() == 8usize); - assert!(offset_of!(RegularExpression, span) == 0usize); - assert!(offset_of!(RegularExpression, pattern) == 8usize); - assert!(offset_of!(RegularExpression, flags) == 56usize); - - assert!(size_of::() == 16usize); - assert!(align_of::() == 4usize); - assert!(offset_of!(Flags, span) == 0usize); - assert!(offset_of!(Flags, global) == 8usize); - assert!(offset_of!(Flags, ignore_case) == 9usize); - assert!(offset_of!(Flags, multiline) == 10usize); - assert!(offset_of!(Flags, unicode) == 11usize); - assert!(offset_of!(Flags, sticky) == 12usize); - assert!(offset_of!(Flags, dot_all) == 13usize); - assert!(offset_of!(Flags, has_indices) == 14usize); - assert!(offset_of!(Flags, unicode_sets) == 15usize); - assert!(size_of::() == 48usize); assert!(align_of::() == 8usize); assert!(offset_of!(Pattern, span) == 0usize); @@ -2967,24 +2949,6 @@ const _: () = { assert!(size_of::() == 1usize); assert!(align_of::() == 1usize); - assert!(size_of::() == 56usize); - assert!(align_of::() == 4usize); - assert!(offset_of!(RegularExpression, span) == 0usize); - assert!(offset_of!(RegularExpression, pattern) == 8usize); - assert!(offset_of!(RegularExpression, flags) == 40usize); - - assert!(size_of::() == 16usize); - assert!(align_of::() == 4usize); - assert!(offset_of!(Flags, span) == 0usize); - assert!(offset_of!(Flags, 
global) == 8usize); - assert!(offset_of!(Flags, ignore_case) == 9usize); - assert!(offset_of!(Flags, multiline) == 10usize); - assert!(offset_of!(Flags, unicode) == 11usize); - assert!(offset_of!(Flags, sticky) == 12usize); - assert!(offset_of!(Flags, dot_all) == 13usize); - assert!(offset_of!(Flags, has_indices) == 14usize); - assert!(offset_of!(Flags, unicode_sets) == 15usize); - assert!(size_of::() == 32usize); assert!(align_of::() == 4usize); assert!(offset_of!(Pattern, span) == 0usize); diff --git a/crates/oxc_linter/src/rules/eslint/no_control_regex.rs b/crates/oxc_linter/src/rules/eslint/no_control_regex.rs index dc4d6e95f1fec..b5de21788c721 100644 --- a/crates/oxc_linter/src/rules/eslint/no_control_regex.rs +++ b/crates/oxc_linter/src/rules/eslint/no_control_regex.rs @@ -1,8 +1,5 @@ use oxc_allocator::Allocator; -use oxc_ast::{ - ast::{Argument, RegExpFlags}, - AstKind, -}; +use oxc_ast::{ast::Argument, AstKind}; use oxc_diagnostics::OxcDiagnostic; use oxc_macros::declare_oxc_lint; use oxc_regular_expression::{ @@ -92,25 +89,19 @@ impl Rule for NoControlRegex { // get pattern from arguments. Missing or non-string arguments // will be runtime errors, but are not covered by this rule. 
let alloc = Allocator::default(); - let pattern_with_slashes = format!("/{}/", &pattern.value); let flags = extract_regex_flags(&expr.arguments); + let flags_text = flags.map_or(String::new(), |f| f.to_string()); let parser = Parser::new( &alloc, - pattern_with_slashes.as_str(), - ParserOptions { - span_offset: expr - .arguments - .first() - .map_or(0, |arg| arg.span().start), - unicode_mode: flags - .is_some_and(|flags| flags.contains(RegExpFlags::U)), - unicode_sets_mode: flags - .is_some_and(|flags| flags.contains(RegExpFlags::V)), - }, + pattern.value.as_str(), + ParserOptions::default() + .with_span_offset( + expr.arguments.first().map_or(0, |arg| arg.span().start), + ) + .with_flags(&flags_text), ); - let Some(pattern) = parser.parse().ok().map(|pattern| pattern.pattern) - else { + let Ok(pattern) = parser.parse() else { return; }; @@ -133,25 +124,19 @@ impl Rule for NoControlRegex { // get pattern from arguments. Missing or non-string arguments // will be runtime errors, but are not covered by this rule. 
let alloc = Allocator::default(); - let pattern_with_slashes = format!("/{}/", &pattern.value); let flags = extract_regex_flags(&expr.arguments); + let flags_text = flags.map_or(String::new(), |f| f.to_string()); let parser = Parser::new( &alloc, - pattern_with_slashes.as_str(), - ParserOptions { - span_offset: expr - .arguments - .first() - .map_or(0, |arg| arg.span().start), - unicode_mode: flags - .is_some_and(|flags| flags.contains(RegExpFlags::U)), - unicode_sets_mode: flags - .is_some_and(|flags| flags.contains(RegExpFlags::V)), - }, + pattern.value.as_str(), + ParserOptions::default() + .with_span_offset( + expr.arguments.first().map_or(0, |arg| arg.span().start), + ) + .with_flags(&flags_text), ); - let Some(pattern) = parser.parse().ok().map(|pattern| pattern.pattern) - else { + let Ok(pattern) = parser.parse() else { return; }; diff --git a/crates/oxc_linter/src/rules/eslint/no_invalid_regexp.rs b/crates/oxc_linter/src/rules/eslint/no_invalid_regexp.rs index 762fa3d529ac3..9b1a5341f0ec0 100644 --- a/crates/oxc_linter/src/rules/eslint/no_invalid_regexp.rs +++ b/crates/oxc_linter/src/rules/eslint/no_invalid_regexp.rs @@ -1,14 +1,28 @@ use oxc_allocator::Allocator; use oxc_ast::{ast::Argument, AstKind}; -use oxc_diagnostics::{LabeledSpan, OxcDiagnostic}; +use oxc_diagnostics::OxcDiagnostic; use oxc_macros::declare_oxc_lint; -use oxc_regular_expression::{FlagsParser, ParserOptions, PatternParser}; +use oxc_regular_expression::{Parser, ParserOptions}; use oxc_span::Span; use rustc_hash::FxHashSet; use serde::Deserialize; use crate::{context::LintContext, rule::Rule, AstNode}; +// Use the same prefix with `oxc_regular_expression` crate +fn duplicated_flag_diagnostic(span: Span) -> OxcDiagnostic { + OxcDiagnostic::warn("Invalid regular expression: Duplicated flag").with_label(span) +} + +fn unknown_flag_diagnostic(span: Span) -> OxcDiagnostic { + OxcDiagnostic::warn("Invalid regular expression: Unknown flag").with_label(span) +} + +fn 
invalid_unicode_flags_diagnostic(span: Span) -> OxcDiagnostic { + OxcDiagnostic::warn("Invalid regular expression: `u` and `v` flags should be used alone") + .with_label(span) +} + #[derive(Debug, Default, Clone)] pub struct NoInvalidRegexp(Box); @@ -72,75 +86,54 @@ impl Rule for NoInvalidRegexp { return; } - let allocator = Allocator::default(); - // Validate flags first if exists - let mut parsed_flags = None; if let Some((flags_span_start, flags_text)) = flags_arg { - // Check for duplicated flags - // For compatibility with ESLint, we need to check "user-defined duplicated" flags here - // "valid duplicated" flags are also checked + let (mut u_flag_found, mut v_flag_found) = (false, false); let mut unique_flags = FxHashSet::default(); - let mut violations = vec![]; for (idx, ch) in flags_text.char_indices() { - if !unique_flags.insert(ch) { - violations.push(idx); - } - } - if !violations.is_empty() { - return ctx.diagnostic( - // Use the same prefix with `oxc_regular_expression` - OxcDiagnostic::warn("Invalid regular expression: Duplicated flag").with_labels( - violations - .iter() - .map(|&start| { - #[allow(clippy::cast_possible_truncation)] - let start = flags_span_start + start as u32; - LabeledSpan::new_with_span(None, Span::new(start, start)) - }) - .collect::>(), - ), - ); - } + #[allow(clippy::cast_possible_truncation)] + let start = flags_span_start + idx as u32; - // Omit user defined invalid flags - for flag in &self.0.allow_constructor_flags { - match flag { - // Keep valid flags, even if they are defined - 'd' | 'g' | 'i' | 'm' | 's' | 'u' | 'v' | 'y' => continue, - _ => { - unique_flags.remove(flag); + // Invalid combination: u+v + if ch == 'u' { + if v_flag_found { + return ctx + .diagnostic(invalid_unicode_flags_diagnostic(Span::new(start, start))); } + u_flag_found = true; + } + if ch == 'v' { + if u_flag_found { + return ctx + .diagnostic(invalid_unicode_flags_diagnostic(Span::new(start, start))); + } + v_flag_found = true; + } + + // 
Duplicated: user defined, invalid or valid + if !unique_flags.insert(ch) { + return ctx.diagnostic(duplicated_flag_diagnostic(Span::new(start, start))); } - } - // Use parser to check: - // - Unknown invalid flags - // - Invalid flags combination: u+v - // - (Valid duplicated flags are already checked above) - // It can be done without `FlagsParser`, though - let flags_text = unique_flags.iter().collect::(); - let options = ParserOptions::default().with_span_offset(flags_span_start); - match FlagsParser::new(&allocator, flags_text.as_str(), options).parse() { - Ok(flags) => parsed_flags = Some(flags), - Err(diagnostic) => return ctx.diagnostic(diagnostic), + // Unknown: not valid, not user defined + if !(matches!(ch, 'd' | 'g' | 'i' | 'm' | 's' | 'u' | 'v' | 'y') + || self.0.allow_constructor_flags.contains(&ch)) + { + return ctx.diagnostic(unknown_flag_diagnostic(Span::new(start, start))); + } } } // Then, validate pattern if exists // Pattern check is skipped when 1st argument is NOT a `StringLiteral` // e.g. 
`new RegExp(var)`, `RegExp("str" + var)` + let allocator = Allocator::default(); if let Some((pattern_span_start, pattern_text)) = pattern_arg { - let mut options = ParserOptions::default().with_span_offset(pattern_span_start); - if let Some(flags) = parsed_flags { - if flags.unicode || flags.unicode_sets { - options = options.with_unicode_mode(); - } - if flags.unicode_sets { - options = options.with_unicode_sets_mode(); - } - } - match PatternParser::new(&allocator, pattern_text, options).parse() { + let options = ParserOptions::default() + .with_span_offset(pattern_span_start) + .with_flags(flags_arg.map_or("", |(_, flags_text)| flags_text)); + + match Parser::new(&allocator, pattern_text, options).parse() { Ok(_) => {} Err(diagnostic) => ctx.diagnostic(diagnostic), } diff --git a/crates/oxc_linter/src/rules/eslint/no_regex_spaces.rs b/crates/oxc_linter/src/rules/eslint/no_regex_spaces.rs index dabb4961f0395..650b6828e3b1b 100644 --- a/crates/oxc_linter/src/rules/eslint/no_regex_spaces.rs +++ b/crates/oxc_linter/src/rules/eslint/no_regex_spaces.rs @@ -105,12 +105,14 @@ impl NoRegexSpaces { } let alloc = Allocator::default(); - let pattern_with_slashes = format!("/{}/", &pattern.value); - let parser = Parser::new(&alloc, pattern_with_slashes.as_str(), ParserOptions::default()); - let regex = parser.parse().ok()?; + let parser = Parser::new( + &alloc, + pattern.value.as_str(), + ParserOptions::default().with_span_offset(pattern.span.start + 1), + ); + let parsed_pattern = parser.parse().ok()?; - find_consecutive_spaces(&regex.pattern) - .map(|span| Span::new(span.start + pattern.span.start, span.end + pattern.span.start)) + find_consecutive_spaces(&parsed_pattern) } fn is_regexp_new_expression(expr: &NewExpression<'_>) -> bool { diff --git a/crates/oxc_linter/src/snapshots/no_invalid_regexp.snap b/crates/oxc_linter/src/snapshots/no_invalid_regexp.snap index 506cc161156c9..d4618cde7bc51 100644 --- a/crates/oxc_linter/src/snapshots/no_invalid_regexp.snap +++ 
b/crates/oxc_linter/src/snapshots/no_invalid_regexp.snap @@ -62,9 +62,9 @@ source: crates/oxc_linter/src/tester.rs ╰──── ⚠ eslint(no-invalid-regexp): Invalid regular expression: Unknown flag - ╭─[no_invalid_regexp.tsx:1:18] + ╭─[no_invalid_regexp.tsx:1:19] 1 │ new RegExp('.', 'aA'); - · ▲ + · ▲ ╰──── ⚠ eslint(no-invalid-regexp): Invalid regular expression: Duplicated flag @@ -91,10 +91,10 @@ source: crates/oxc_linter/src/tester.rs · ▲ ╰──── - ⚠ eslint(no-invalid-regexp): Invalid regular expression: Duplicated flag - ╭─[no_invalid_regexp.tsx:1:20] + ⚠ eslint(no-invalid-regexp): Invalid regular expression: Unknown flag + ╭─[no_invalid_regexp.tsx:1:18] 1 │ new RegExp('.', 'ouo'); - · ▲ + · ▲ ╰──── ⚠ eslint(no-invalid-regexp): Invalid regular expression: Could not parse the entire pattern @@ -164,9 +164,9 @@ source: crates/oxc_linter/src/tester.rs ╰──── ⚠ eslint(no-invalid-regexp): Invalid regular expression: Unknown flag - ╭─[no_invalid_regexp.tsx:1:22] + ╭─[no_invalid_regexp.tsx:1:23] 1 │ new RegExp(pattern, 'az'); - · ▲ + · ▲ ╰──── ⚠ eslint(no-invalid-regexp): Invalid regular expression: Unterminated character class @@ -175,16 +175,16 @@ source: crates/oxc_linter/src/tester.rs · ─── ╰──── - ⚠ eslint(no-invalid-regexp): Invalid regular expression: Invalid flags, `u` and `v` should be used alone - ╭─[no_invalid_regexp.tsx:1:18] + ⚠ eslint(no-invalid-regexp): Invalid regular expression: `u` and `v` flags should be used alone + ╭─[no_invalid_regexp.tsx:1:19] 1 │ new RegExp('.', 'uv'); - · ── + · ▲ ╰──── - ⚠ eslint(no-invalid-regexp): Invalid regular expression: Invalid flags, `u` and `v` should be used alone - ╭─[no_invalid_regexp.tsx:1:22] + ⚠ eslint(no-invalid-regexp): Invalid regular expression: `u` and `v` flags should be used alone + ╭─[no_invalid_regexp.tsx:1:23] 1 │ new RegExp(pattern, 'uv'); - · ── + · ▲ ╰──── ⚠ eslint(no-invalid-regexp): Invalid regular expression: Character class atom range out of order diff --git 
a/crates/oxc_parser/examples/regular_expression.rs b/crates/oxc_parser/examples/regular_expression.rs index a26225afb24cc..2ac496d5da120 100644 --- a/crates/oxc_parser/examples/regular_expression.rs +++ b/crates/oxc_parser/examples/regular_expression.rs @@ -4,7 +4,7 @@ use std::{env, fs, path::Path, sync::Arc}; use oxc_allocator::Allocator; use oxc_ast::{ast, AstKind, Visit}; use oxc_parser::{ParseOptions, Parser}; -use oxc_regular_expression::{FlagsParser, ParserOptions, PatternParser}; +use oxc_regular_expression::{Parser as RegExpParser, ParserOptions as RegExpParserOptions}; use oxc_span::SourceType; // `cargo run -p oxc_parser --example regular_expression` @@ -45,6 +45,7 @@ struct RegularExpressionVisitor { impl<'a> Visit<'a> for RegularExpressionVisitor { fn enter_node(&mut self, kind: AstKind<'a>) { let allocator = Allocator::default(); + match kind { AstKind::RegExpLiteral(re) => { println!("🍀 {}", re.span.source_text(self.source_text.as_ref())); @@ -61,12 +62,12 @@ impl<'a> Visit<'a> for RegularExpressionVisitor { { println!("🍀 {}", new_expr.span.source_text(&self.source_text)); - let pattern = match new_expr.arguments.first() { - Some(ast::Argument::StringLiteral(sl)) => &sl.value, + let (pattern, pattern_span) = match new_expr.arguments.first() { + Some(ast::Argument::StringLiteral(sl)) => (&sl.value, &sl.span), Some(ast::Argument::TemplateLiteral(tl)) if tl.is_no_substitution_template() => { - &tl.quasi().unwrap() + (&tl.quasi().unwrap(), &tl.span) } _ => return, }; @@ -81,16 +82,12 @@ impl<'a> Visit<'a> for RegularExpressionVisitor { _ => "", }; - let flags = - FlagsParser::new(&allocator, flags, ParserOptions::default()).parse().unwrap(); - let parsed = PatternParser::new( + let parsed = RegExpParser::new( &allocator, pattern, - ParserOptions { - span_offset: new_expr.span.start + 12, // = "new RegExp(\"".len() - unicode_mode: flags.unicode || flags.unicode_sets, - unicode_sets_mode: flags.unicode_sets, - }, + RegExpParserOptions::default() + 
.with_span_offset(pattern_span.start + 1) + .with_flags(flags), ) .parse(); diff --git a/crates/oxc_parser/src/js/expression.rs b/crates/oxc_parser/src/js/expression.rs index ca883f683ed1d..3ae3be06e6f5f 100644 --- a/crates/oxc_parser/src/js/expression.rs +++ b/crates/oxc_parser/src/js/expression.rs @@ -340,14 +340,16 @@ impl<'a> ParserImpl<'a> { let span = self.start_span(); // split out pattern let (pattern_end, flags) = self.read_regex()?; - let pattern_start = self.cur_token().start + 1; // +1 to exclude `/` + let pattern_start = self.cur_token().start + 1; // +1 to exclude left `/` let pattern_text = &self.source_text[pattern_start as usize..pattern_end as usize]; + let flags_start = pattern_end + 1; // +1 to include right `/` + let flags_text = &self.source_text[flags_start as usize..self.cur_token().end as usize]; self.bump_any(); let pattern = self .options .parse_regular_expression .then_some(()) - .map(|()| self.parse_regex_pattern(pattern_start, pattern_text, flags)) + .map(|()| self.parse_regex_pattern(pattern_start, pattern_text, flags_text)) .map_or_else( || RegExpPattern::Raw(pattern_text), |pat| { @@ -361,15 +363,11 @@ impl<'a> ParserImpl<'a> { &mut self, span_offset: u32, pattern: &'a str, - flags: RegExpFlags, + flags: &'a str, ) -> Option>> { - use oxc_regular_expression::{ParserOptions, PatternParser}; - let options = ParserOptions { - span_offset, - unicode_mode: flags.contains(RegExpFlags::U) || flags.contains(RegExpFlags::V), - unicode_sets_mode: flags.contains(RegExpFlags::V), - }; - match PatternParser::new(self.ast.allocator, pattern, options).parse() { + use oxc_regular_expression::{Parser, ParserOptions}; + let options = ParserOptions::default().with_span_offset(span_offset).with_flags(flags); + match Parser::new(self.ast.allocator, pattern, options).parse() { Ok(regular_expression) => Some(self.ast.alloc(regular_expression)), Err(diagnostic) => { self.error(diagnostic); diff --git 
a/crates/oxc_regular_expression/examples/parse_literal.rs b/crates/oxc_regular_expression/examples/parse_literal.rs index 26b50b9146415..687faebfccac3 100644 --- a/crates/oxc_regular_expression/examples/parse_literal.rs +++ b/crates/oxc_regular_expression/examples/parse_literal.rs @@ -1,60 +1,63 @@ #![allow(clippy::print_stdout)] use oxc_allocator::Allocator; -use oxc_regular_expression::{ast, Parser, ParserOptions}; +use oxc_regular_expression::{Parser, ParserOptions}; fn main() { let allocator = Allocator::default(); - for source_text in [ - "/ab/", - "/abc/i", - "/abcd/igv", - "/emo👈🏻ji/u", - "/ab|c/i", - "/a|b+|c/i", - "/a{0}|b{1,2}|c{3,}/i", - "/(?=a)|(?<=b)|(?!c)|(?x\1c/u", - r"/(cg)(?cg)(?:g)/", - r"/{3}/", // Error - r"/Em🥹j/", - r"/^(?=ab)\b(?!cd)(?<=ef)\B(?)(?)/", // Error - r"/(?noname)/v", // Error - r"/[\bb]/", + for (pattern, flags) in [ + (r"ab", ""), + (r"abc", "i"), + (r"abcd", "igv"), + (r"emo👈🏻ji", "u"), + (r"ab|c", "i"), + (r"a|b+|c", "i"), + (r"a{0}|b{1,2}|c{3,}", "i"), + (r"(?=a)|(?<=b)|(?!c)|(?x\1c", "u"), + (r"(cg)(?cg)(?:g)", ""), + (r"{3}", ""), // Error + (r"Em🥹j", ""), + (r"^(?=ab)\b(?!cd)(?<=ef)\B(?)(?)", ""), // Error + (r"(?noname)", "v"), // Error + (r"[\bb]", ""), + (r"a{2,1}", "v"), // Error ] { - println!("Parse: {source_text}"); - let parser = Parser::new(&allocator, source_text, ParserOptions::default()); + let parser = Parser::new( + &allocator, + pattern, + ParserOptions::default().with_span_offset(1).with_flags(flags), + ); let ret = parser.parse(); + let literal = format!("/{pattern}/{flags}"); + println!("Parse: {literal}"); match ret { - Ok(ast::RegularExpression { pattern, flags, .. 
}) => { - println!("✨ {}", pattern.span.source_text(source_text)); - println!("{pattern:#?}"); - println!("✨ {}", flags.span.source_text(source_text)); - println!("{flags:?}"); + Ok(pattern) => { + println!("✨ {pattern:#?}"); } Err(error) => { - let error = error.with_source_code(source_text); + let error = error.with_source_code(literal); println!("💥 {error:?}"); } } diff --git a/crates/oxc_regular_expression/examples/regex_visitor.rs b/crates/oxc_regular_expression/examples/regex_visitor.rs index f03813c94c2d4..154db705b838b 100644 --- a/crates/oxc_regular_expression/examples/regex_visitor.rs +++ b/crates/oxc_regular_expression/examples/regex_visitor.rs @@ -20,10 +20,12 @@ impl Visit<'_> for TestVisitor { } fn main() { - let source_text = r"/(https?:\/\/github\.com\/(([^\s]+)\/([^\s]+))\/([^\s]+\/)?(issues|pull)\/([0-9]+))|(([^\s]+)\/([^\s]+))?#([1-9][0-9]*)($|[\s\:\;\-\(\=])/"; + let source_text = r"(https?:\/\/github\.com\/(([^\s]+)\/([^\s]+))\/([^\s]+\/)?(issues|pull)\/([0-9]+))|(([^\s]+)\/([^\s]+))?#([1-9][0-9]*)($|[\s\:\;\-\(\=])"; + let allocator = Allocator::default(); let parser = Parser::new(&allocator, source_text, ParserOptions::default()); - let pattern = parser.parse().unwrap().pattern; + let pattern = parser.parse().unwrap(); + let mut visitor = TestVisitor; visitor.visit_pattern(&pattern); } diff --git a/crates/oxc_regular_expression/src/ast.rs b/crates/oxc_regular_expression/src/ast.rs index 3644246113889..f843ed2b9fa96 100644 --- a/crates/oxc_regular_expression/src/ast.rs +++ b/crates/oxc_regular_expression/src/ast.rs @@ -9,32 +9,6 @@ use serde::Serialize; #[cfg(feature = "serialize")] use tsify::Tsify; -#[ast] -#[derive(Debug)] -#[generate_derive(CloneIn, ContentEq, ContentHash)] -#[cfg_attr(feature = "serialize", derive(Serialize, Tsify))] -pub struct RegularExpression<'a> { - pub span: Span, - pub pattern: Pattern<'a>, - pub flags: Flags, -} - -#[ast] -#[derive(Debug, Clone)] -#[generate_derive(CloneIn, ContentEq, ContentHash)] 
-#[cfg_attr(feature = "serialize", derive(Serialize, Tsify))] -pub struct Flags { - pub span: Span, - pub global: bool, - pub ignore_case: bool, - pub multiline: bool, - pub unicode: bool, - pub sticky: bool, - pub dot_all: bool, - pub has_indices: bool, - pub unicode_sets: bool, -} - /// The root of the `PatternParser` result. #[ast] #[derive(Debug)] diff --git a/crates/oxc_regular_expression/src/ast_impl/display.rs b/crates/oxc_regular_expression/src/ast_impl/display.rs index 939de35b3be31..4e90b689de726 100644 --- a/crates/oxc_regular_expression/src/ast_impl/display.rs +++ b/crates/oxc_regular_expression/src/ast_impl/display.rs @@ -7,37 +7,6 @@ use std::{ use crate::ast::*; use crate::surrogate_pair::{combine_surrogate_pair, is_lead_surrogate, is_trail_surrogate}; -impl<'a> Display for RegularExpression<'a> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "/{}/{}", self.pattern, self.flags) - } -} - -impl Display for Flags { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut flags = String::with_capacity(8); - - // write flags in the order they are described in the `MDN` - // - for (v, ch) in [ - (self.has_indices, 'd'), - (self.global, 'g'), - (self.ignore_case, 'i'), - (self.multiline, 'm'), - (self.dot_all, 's'), - (self.unicode, 'u'), - (self.unicode_sets, 'v'), - (self.sticky, 'y'), - ] { - if v { - flags.push(ch); - } - } - - write!(f, "{flags}") - } -} - impl<'a> Display for Pattern<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.body) @@ -513,10 +482,8 @@ mod test { (r"/\t\n\v\f\r/u", None), (r"/\p{L}/u", None), (r"/\d/g", None), - // Lose the flags ordering -- - ("/abcd/igv", Some("/abcd/giv")), - (r"/\d/ug", Some(r"/\d/gu")), - // -- + ("/abcd/igv", Some("/abcd/igv")), + (r"/\d/ug", Some(r"/\d/ug")), // we capitalize hex unicodes. 
(r"/\n\cM\0\x41\u{1f600}\./u", Some(r"/\n\cM\0\x41\u{1F600}\./u")), (r"/\u02c1/u", Some(r"/\u02C1/u")), @@ -577,15 +544,26 @@ mod test { (r"/([\-a-z]{0,31})/iu", None), ]; - fn test_display(allocator: &Allocator, (source, expect): &Case) { - let expect = expect.unwrap_or(source); - let actual = Parser::new(allocator, source, ParserOptions::default()).parse().unwrap(); - assert_eq!(expect, actual.to_string()); - } - #[test] - fn test() { + fn test_display() { let allocator = &Allocator::default(); - CASES.iter().for_each(|case| test_display(allocator, case)); + + for (input, output) in CASES { + let (left_slash, right_slash) = (input.find('/').unwrap(), input.rfind('/').unwrap()); + + let pattern = &input[left_slash + 1..right_slash]; + let flags = &input[right_slash + 1..]; + + let actual = Parser::new( + allocator, + pattern, + ParserOptions::default().with_span_offset(1).with_flags(flags), + ) + .parse() + .unwrap(); + + let expect = output.unwrap_or(input); + assert_eq!(expect, format!("/{actual}/{flags}")); // This uses `Display` impls + } } } diff --git a/crates/oxc_regular_expression/src/body_parser/mod.rs b/crates/oxc_regular_expression/src/body_parser/mod.rs deleted file mode 100644 index 0be06b888c624..0000000000000 --- a/crates/oxc_regular_expression/src/body_parser/mod.rs +++ /dev/null @@ -1,278 +0,0 @@ -mod parser; -mod reader; -mod state; -mod unicode; -mod unicode_property; - -pub use parser::PatternParser; - -#[cfg(test)] -mod test { - use oxc_allocator::Allocator; - - use crate::{ParserOptions, PatternParser}; - - #[test] - fn should_pass() { - let allocator = Allocator::default(); - - for (source_text, options) in &[ - ("", ParserOptions::default()), - ("a", ParserOptions::default()), - ("a+", ParserOptions::default()), - ("a*", ParserOptions::default()), - ("a?", ParserOptions::default()), - ("^$^$^$", ParserOptions::default()), - ("(?=a){1}", ParserOptions::default()), - ("(?!a){1}", ParserOptions::default()), - ("a{1}", 
ParserOptions::default()), - ("a{1", ParserOptions::default()), - ("a|{", ParserOptions::default()), - ("a{", ParserOptions::default()), - ("a{,", ParserOptions::default()), - ("a{1,", ParserOptions::default()), - ("a{1,}", ParserOptions::default()), - ("a{1,2}", ParserOptions::default()), - ("x{9007199254740991}", ParserOptions::default()), - ("x{9007199254740991,9007199254740991}", ParserOptions::default()), - ("a|b", ParserOptions::default()), - ("a|b|c", ParserOptions::default()), - ("a|b+?|c", ParserOptions::default()), - ("a+b*?c{1}d{2,}e{3,4}?", ParserOptions::default()), - (r"^(?=ab)\b(?!cd)(?<=ef)\B(?.)\x1f", ParserOptions::default()), - ("a]", ParserOptions::default()), - ("a}", ParserOptions::default()), - ("]", ParserOptions::default()), - ("[]", ParserOptions::default()), - ("[a]", ParserOptions::default()), - ("[ab]", ParserOptions::default()), - ("[a-b]", ParserOptions::default()), - ("[-]", ParserOptions::default()), - ("[a-]", ParserOptions::default()), - ("[-a]", ParserOptions::default()), - ("[-a-]", ParserOptions::default()), - (r"[a\-b]", ParserOptions::default()), - (r"[-a-b]", ParserOptions::default()), - (r"[a-b-]", ParserOptions::default()), - (r"[a\-b-]", ParserOptions::default()), - (r"[\[\]\-]", ParserOptions::default()), - ("[a-z0-9]", ParserOptions::default()), - ("[a-a]", ParserOptions::default()), - (r"[\d-\D]", ParserOptions::default()), - (r"^([\ud801[\udc28-\udc4f])$", ParserOptions::default()), - (r"[a-c]]", ParserOptions::default()), - ( - r"[ϗϙϛϝϟϡϣϥϧϩϫϭϯ-ϳϵϸϻ-ϼа-џѡѣѥѧѩѫѭѯѱѳѵѷѹѻѽѿҁҋҍҏґғҕҗҙқҝҟҡңҥҧҩҫҭүұҳҵҷҹһҽҿӂӄӆӈӊӌӎ-ӏӑӓӕӗәӛӝӟӡӣӥӧөӫӭӯӱӳӵӷӹӻӽӿԁԃԅԇԉԋԍԏԑԓԕԗԙԛԝԟԡԣա-ևᴀ-ᴫᵢ-ᵷᵹ-ᶚḁḃḅḇḉḋḍḏḑḓḕḗḙḛḝḟḡḣḥḧḩḫḭḯḱḳḵḷḹḻḽḿṁṃṅṇṉṋṍṏṑṓṕṗṙṛṝṟṡṣṥṧṩṫṭṯṱṳṵṷṹṻṽṿẁẃẅẇẉẋẍẏẑẓẕ-ẝẟạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹỻỽỿ-ἇἐ-ἕἠ-ἧἰ-ἷὀ-ὅὐ-ὗὠ-ὧὰ]", - ParserOptions::default(), - ), - (r"[a-z0-9[.\\]]", ParserOptions::default().with_unicode_sets_mode()), - (r"[a&&b&&c]", ParserOptions::default().with_unicode_sets_mode()), - (r"[a--b--c]", 
ParserOptions::default().with_unicode_sets_mode()), - (r"[[a-z]--b--c]", ParserOptions::default().with_unicode_sets_mode()), - ( - r"[[[[[[[[[[[[[[[[[[[[[[[[a]]]]]]]]]]]]]]]]]]]]]]]]", - ParserOptions::default().with_unicode_sets_mode(), - ), - ( - r"[\q{}\q{a}\q{bc}\q{d|e|f}\q{|||}]", - ParserOptions::default().with_unicode_sets_mode(), - ), - (r"(?A)\k", ParserOptions::default()), - (r"(?)\k", ParserOptions::default()), - (r"\k", ParserOptions::default()), - (r"\k<4>", ParserOptions::default()), - (r"\k", ParserOptions::default()), - (r"(?)\k", ParserOptions::default()), - (r"(?)\k", ParserOptions::default().with_unicode_mode()), - (r"\1", ParserOptions::default()), - (r"\1()", ParserOptions::default()), - (r"\1()", ParserOptions::default().with_unicode_mode()), - (r"(?..)(?..)", ParserOptions::default()), - // TODO: ES2025 Duplicate named capturing groups - // (r"(?..)|(?..)", ParserOptions::default()), - // (r"(?[0-9]{4})-[0-9]{2}|[0-9]{2}-(?[0-9]{4})", ParserOptions::default()), - // (r"(?:(?x)|(?y))\k", ParserOptions::default()), - ] { - let res = PatternParser::new(&allocator, source_text, *options).parse(); - if let Err(err) = res { - panic!("Failed to parse {source_text} with {options:?}\n💥 {err}"); - } - } - } - - #[test] - fn should_fail() { - let allocator = Allocator::default(); - - for (source_text, options) in &[ - ("a)", ParserOptions::default()), - (r"a\", ParserOptions::default()), - ("a]", ParserOptions::default().with_unicode_mode()), - ("a}", ParserOptions::default().with_unicode_mode()), - ("a|+", ParserOptions::default()), - ("a|{", ParserOptions::default().with_unicode_mode()), - ("a{", ParserOptions::default().with_unicode_mode()), - ("a{1", ParserOptions::default().with_unicode_mode()), - ("a{1,", ParserOptions::default().with_unicode_mode()), - ("a{,", ParserOptions::default().with_unicode_mode()), - ("x{9007199254740992}", ParserOptions::default()), - ("x{9007199254740991,9007199254740992}", ParserOptions::default()), - 
("x{99999999999999999999999999999999999999999999999999}", ParserOptions::default()), - (r"\99999999999999999999999999999999999999999999999999", ParserOptions::default()), - (r"\u{FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF}", ParserOptions::default().with_unicode_mode()), - ("(?=a", ParserOptions::default()), - ("(?", ParserOptions::default().with_unicode_mode()), - (r"\k<4>", ParserOptions::default().with_unicode_mode()), - (r"\k", ParserOptions::default().with_unicode_mode()), - ("a(?:", ParserOptions::default()), - ("(a", ParserOptions::default()), - ("(?", ParserOptions::default()), - (r"(?.)", ParserOptions::default()), - (r"(?.)", ParserOptions::default().with_unicode_mode()), - (r"(?<\>.)", ParserOptions::default()), - (r"(?<\>.)", ParserOptions::default().with_unicode_mode()), - ("(?)", ParserOptions::default()), - ("(?=a){1}", ParserOptions::default().with_unicode_mode()), - ("(?!a){1}", ParserOptions::default().with_unicode_mode()), - (r"[\d-\D]", ParserOptions::default().with_unicode_mode()), - ("[", ParserOptions::default()), - ("[", ParserOptions::default().with_unicode_sets_mode()), - ("[[", ParserOptions::default().with_unicode_sets_mode()), - ("[[]", ParserOptions::default().with_unicode_sets_mode()), - ("[z-a]", ParserOptions::default()), - (r"[a-c]]", ParserOptions::default().with_unicode_mode()), - ( - 
r"^([a-zªµºß-öø-ÿāăąćĉċčďđēĕėęěĝğġģĥħĩīĭįıijĵķ-ĸĺļľŀłńņň-ʼnŋōŏőœŕŗřśŝşšţťŧũūŭůűųŵŷźżž-ƀƃƅƈƌ-ƍƒƕƙ-ƛƞơƣƥƨƪ-ƫƭưƴƶƹ-ƺƽ-ƿdžljnjǎǐǒǔǖǘǚǜ-ǝǟǡǣǥǧǩǫǭǯ-ǰdzǵǹǻǽǿȁȃȅȇȉȋȍȏȑȓȕȗșțȝȟȡȣȥȧȩȫȭȯȱȳ-ȹȼȿ-ɀɂɇɉɋɍɏ-ʓʕ-ʯͱͳͷͻ-ͽΐά-ώϐ-ϑϕ-ϗϙϛϝϟϡϣϥϧϩϫϭϯ-ϳϵϸϻ-ϼа-џѡѣѥѧѩѫѭѯѱѳѵѷѹѻѽѿҁҋҍҏґғҕҗҙқҝҟҡңҥҧҩҫҭүұҳҵҷҹһҽҿӂӄӆӈӊӌӎ-ӏӑӓӕӗәӛӝӟӡӣӥӧөӫӭӯӱӳӵӷӹӻӽӿԁԃԅԇԉԋԍԏԑԓԕԗԙԛԝԟԡԣա-ևᴀ-ᴫᵢ-ᵷᵹ-ᶚḁḃḅḇḉḋḍḏḑḓḕḗḙḛḝḟḡḣḥḧḩḫḭḯḱḳḵḷḹḻḽḿṁṃṅṇṉṋṍṏṑṓṕṗṙṛṝṟṡṣṥṧṩṫṭṯṱṳṵṷṹṻṽṿẁẃẅẇẉẋẍẏẑẓẕ-ẝẟạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹỻỽỿ-ἇἐ-ἕἠ-ἧἰ-ἷὀ-ὅὐ-ὗὠ-ὧὰ-ώᾀ-ᾇᾐ-ᾗᾠ-ᾧᾰ-ᾴᾶ-ᾷιῂ-ῄῆ-ῇῐ-ΐῖ-ῗῠ-ῧῲ-ῴῶ-ῷⁱⁿℊℎ-ℏℓℯℴℹℼ-ℽⅆ-ⅉⅎↄⰰ-ⱞⱡⱥ-ⱦⱨⱪⱬⱱⱳ-ⱴⱶ-ⱼⲁⲃⲅⲇⲉⲋⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱⲳⲵⲷⲹⲻⲽⲿⳁⳃⳅⳇⳉⳋⳍⳏⳑⳓⳕⳗⳙⳛⳝⳟⳡⳣ-ⳤⴀ-ⴥꙁꙃꙅꙇꙉꙋꙍꙏꙑꙓꙕꙗꙙꙛꙝꙟꙣꙥꙧꙩꙫꙭꚁꚃꚅꚇꚉꚋꚍꚏꚑꚓꚕꚗꜣꜥꜧꜩꜫꜭꜯ-ꜱꜳꜵꜷꜹꜻꜽꜿꝁꝃꝅꝇꝉꝋꝍꝏꝑꝓꝕꝗꝙꝛꝝꝟꝡꝣꝥꝧꝩꝫꝭꝯꝱ-ꝸꝺꝼꝿꞁꞃꞅꞇꞌff-stﬓ-ﬗa-z]|\ud801[\udc28-\udc4f]|\ud835[\udc1a-\udc33\udc4e-\udc54\udc56-\udc67\udc82-\udc9b\udcb6-\udcb9\udcbb\udcbd-\udcc3\udcc5-\udccf\udcea-\udd03\udd1e-\udd37\udd52-\udd6b\udd86-\udd9f\uddba-\uddd3\uddee-\ude07\ude22-\ude3b\ude56-\ude6f\ude8a-\udea5\udec2-\udeda\udedc-\udee1\udefc-\udf14\udf16-\udf1b\udf36-\udf4e\udf50-\udf55\udf70-\udf88\udf8a-\udf8f\udfaa-\udfc2\udfc4-\udfc9\udfcb])$", - ParserOptions::default(), - ), - (r"[[\d-\D]]", ParserOptions::default().with_unicode_sets_mode()), - (r"[a&&b--c]", ParserOptions::default().with_unicode_sets_mode()), - (r"[a--b&&c]", ParserOptions::default().with_unicode_sets_mode()), - (r"[\q{]", ParserOptions::default().with_unicode_sets_mode()), - (r"[\q{\a}]", ParserOptions::default().with_unicode_sets_mode()), - // TODO: ES2025 Duplicate named capturing groups - (r"(?..)|(?..)", ParserOptions::default()), // This will be valid - // (r"(?|(?))", ParserOptions::default()), // Nested, still invalid - ] { - assert!( - PatternParser::new(&allocator, source_text, *options).parse().is_err(), - "{source_text} should fail to parse with {options:?}!" 
- ); - } - } - - #[test] - fn should_fail_early_errors() { - let allocator = Allocator::default(); - - for (source_text, options, is_err) in &[ - // No tests for 4,294,967,295 left parens - (r"(?..)(?..)", ParserOptions::default(), true), - (r"a{2,1}", ParserOptions::default(), true), - (r"(?)\k", ParserOptions::default(), true), - (r"()\2", ParserOptions::default().with_unicode_mode(), true), - (r"[a-\d]", ParserOptions::default().with_unicode_mode(), true), - (r"[\d-z]", ParserOptions::default().with_unicode_mode(), true), - (r"[\d-\d]", ParserOptions::default().with_unicode_mode(), true), - (r"[z-a]", ParserOptions::default(), true), - (r"\u{110000}", ParserOptions::default().with_unicode_mode(), true), - (r"(?<\uD800\uDBFF>)", ParserOptions::default(), true), - (r"\u{0}\u{110000}", ParserOptions::default().with_unicode_mode(), true), - (r"(?)", ParserOptions::default(), true), - (r"\p{Foo=Bar}", ParserOptions::default().with_unicode_mode(), true), - (r"\p{Foo}", ParserOptions::default().with_unicode_mode(), true), - (r"\p{Basic_Emoji}", ParserOptions::default().with_unicode_mode(), true), - (r"\P{Basic_Emoji}", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[^\p{Basic_Emoji}]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\p{Basic_Emoji}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[^\q{}]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{ng}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{a|}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{ng}\q{o|k}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{o|k}\q{ng}\q{o|k}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{o|k}\q{o|k}\q{ng}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{}&&\q{ng}]]", 
ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{ng}&&\q{o|k}]]", ParserOptions::default().with_unicode_sets_mode(), false), - ( - r"[[^\q{ng}&&\q{o|k}&&\q{ng}]]", - ParserOptions::default().with_unicode_sets_mode(), - false, - ), - (r"[[^\q{ng}--\q{o|k}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{o|k}--\q{ng}]]", ParserOptions::default().with_unicode_sets_mode(), false), - (r"[[z-a]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[[[[^[[[[\q{ng}]]]]]]]]]", ParserOptions::default().with_unicode_sets_mode(), true), - ( - r"[^[[[[[[[[[[[[[[[[\q{ng}]]]]]]]]]]]]]]]]]", - ParserOptions::default().with_unicode_sets_mode(), - true, - ), - ] { - assert_eq!( - PatternParser::new(&allocator, source_text, *options).parse().is_err(), - *is_err, - "{source_text} should early error with {options:?}!" - ); - } - } - - #[test] - fn should_handle_empty() { - let allocator = Allocator::default(); - let pattern = PatternParser::new(&allocator, "", ParserOptions::default()).parse().unwrap(); - - assert_eq!(pattern.body.body[0].body.len(), 1); - } - - #[test] - fn should_handle_unicode() { - let allocator = Allocator::default(); - let source_text = "このEmoji🥹の数が変わる"; - - for (options, expected) in &[ - (ParserOptions::default(), 15), - (ParserOptions::default().with_unicode_mode(), 14), - (ParserOptions::default().with_unicode_sets_mode(), 14), - ] { - let pattern = PatternParser::new(&allocator, source_text, *options).parse().unwrap(); - assert_eq!(pattern.body.body[0].body.len(), *expected); - } - } -} diff --git a/crates/oxc_regular_expression/src/diagnostics.rs b/crates/oxc_regular_expression/src/diagnostics.rs index e0e6866d69e86..12945812331ab 100644 --- a/crates/oxc_regular_expression/src/diagnostics.rs +++ b/crates/oxc_regular_expression/src/diagnostics.rs @@ -3,43 +3,6 @@ use oxc_span::Span; const PREFIX: &str = "Invalid regular expression:"; -// For (Literal)Parser --- - -#[cold] -pub fn 
unexpected_literal_char(span: Span) -> OxcDiagnostic { - OxcDiagnostic::error(format!("{PREFIX} Unexpected literal character")).with_label(span) -} - -#[cold] -pub fn unterminated_literal(span: Span, kind: &str) -> OxcDiagnostic { - OxcDiagnostic::error(format!("{PREFIX} Unterminated {kind}")).with_label(span) -} - -#[cold] -pub fn empty_literal(span: Span) -> OxcDiagnostic { - OxcDiagnostic::error(format!("{PREFIX} Empty literal")).with_label(span) -} - -// For FlagsParser --- - -#[cold] -pub fn duplicated_flag(span: Span) -> OxcDiagnostic { - OxcDiagnostic::error(format!("{PREFIX} Duplicated flag")).with_label(span) -} - -#[cold] -pub fn unknown_flag(span: Span) -> OxcDiagnostic { - OxcDiagnostic::error(format!("{PREFIX} Unknown flag")).with_label(span) -} - -#[cold] -pub fn invalid_unicode_flags(span: Span) -> OxcDiagnostic { - OxcDiagnostic::error(format!("{PREFIX} Invalid flags, `u` and `v` should be used alone")) - .with_label(span) -} - -// For PatternParser --- - #[cold] pub fn duplicated_capturing_group_names(spans: Vec) -> OxcDiagnostic { OxcDiagnostic::error(format!("{PREFIX} Duplicated capturing group names")).with_labels(spans) diff --git a/crates/oxc_regular_expression/src/flags_parser.rs b/crates/oxc_regular_expression/src/flags_parser.rs deleted file mode 100644 index 6d6e7cd09b9bf..0000000000000 --- a/crates/oxc_regular_expression/src/flags_parser.rs +++ /dev/null @@ -1,68 +0,0 @@ -use oxc_allocator::Allocator; -use oxc_diagnostics::Result; -use rustc_hash::FxHashSet; - -use crate::{ast, diagnostics, options::ParserOptions, span_factory::SpanFactory}; - -pub struct FlagsParser<'a> { - source_text: &'a str, - // options: ParserOptions, - span_factory: SpanFactory, -} - -impl<'a> FlagsParser<'a> { - pub fn new(_allocator: &'a Allocator, source_text: &'a str, options: ParserOptions) -> Self { - Self { - source_text, - // options, - span_factory: SpanFactory::new(options.span_offset), - } - } - - pub fn parse(&mut self) -> Result { - let span = 
self.span_factory.create(0, self.source_text.len()); - let mut global = false; - let mut ignore_case = false; - let mut multiline = false; - let mut unicode = false; - let mut sticky = false; - let mut dot_all = false; - let mut has_indices = false; - let mut unicode_sets = false; - - let mut existing_flags = FxHashSet::default(); - for (idx, c) in self.source_text.char_indices() { - if !existing_flags.insert(c) { - return Err(diagnostics::duplicated_flag(self.span_factory.create(idx, idx))); - } - - match c { - 'g' => global = true, - 'i' => ignore_case = true, - 'm' => multiline = true, - 'u' => unicode = true, - 'y' => sticky = true, - 's' => dot_all = true, - 'd' => has_indices = true, - 'v' => unicode_sets = true, - _ => return Err(diagnostics::unknown_flag(self.span_factory.create(idx, idx))), - } - } - - if unicode && unicode_sets { - return Err(diagnostics::invalid_unicode_flags(span)); - } - - Ok(ast::Flags { - span, - global, - ignore_case, - multiline, - unicode, - sticky, - dot_all, - has_indices, - unicode_sets, - }) - } -} diff --git a/crates/oxc_regular_expression/src/generated/derive_clone_in.rs b/crates/oxc_regular_expression/src/generated/derive_clone_in.rs index d41d5daae7fcf..6f8de2299f1e4 100644 --- a/crates/oxc_regular_expression/src/generated/derive_clone_in.rs +++ b/crates/oxc_regular_expression/src/generated/derive_clone_in.rs @@ -8,34 +8,6 @@ use oxc_allocator::{Allocator, CloneIn}; #[allow(clippy::wildcard_imports)] use crate::ast::*; -impl<'old_alloc, 'new_alloc> CloneIn<'new_alloc> for RegularExpression<'old_alloc> { - type Cloned = RegularExpression<'new_alloc>; - fn clone_in(&self, allocator: &'new_alloc Allocator) -> Self::Cloned { - RegularExpression { - span: CloneIn::clone_in(&self.span, allocator), - pattern: CloneIn::clone_in(&self.pattern, allocator), - flags: CloneIn::clone_in(&self.flags, allocator), - } - } -} - -impl<'alloc> CloneIn<'alloc> for Flags { - type Cloned = Flags; - fn clone_in(&self, allocator: &'alloc 
Allocator) -> Self::Cloned { - Flags { - span: CloneIn::clone_in(&self.span, allocator), - global: CloneIn::clone_in(&self.global, allocator), - ignore_case: CloneIn::clone_in(&self.ignore_case, allocator), - multiline: CloneIn::clone_in(&self.multiline, allocator), - unicode: CloneIn::clone_in(&self.unicode, allocator), - sticky: CloneIn::clone_in(&self.sticky, allocator), - dot_all: CloneIn::clone_in(&self.dot_all, allocator), - has_indices: CloneIn::clone_in(&self.has_indices, allocator), - unicode_sets: CloneIn::clone_in(&self.unicode_sets, allocator), - } - } -} - impl<'old_alloc, 'new_alloc> CloneIn<'new_alloc> for Pattern<'old_alloc> { type Cloned = Pattern<'new_alloc>; fn clone_in(&self, allocator: &'new_alloc Allocator) -> Self::Cloned { diff --git a/crates/oxc_regular_expression/src/generated/derive_content_eq.rs b/crates/oxc_regular_expression/src/generated/derive_content_eq.rs index e21234c42117e..e4c3f8bd27df8 100644 --- a/crates/oxc_regular_expression/src/generated/derive_content_eq.rs +++ b/crates/oxc_regular_expression/src/generated/derive_content_eq.rs @@ -8,26 +8,6 @@ use oxc_span::cmp::ContentEq; #[allow(clippy::wildcard_imports)] use crate::ast::*; -impl<'a> ContentEq for RegularExpression<'a> { - fn content_eq(&self, other: &Self) -> bool { - ContentEq::content_eq(&self.pattern, &other.pattern) - && ContentEq::content_eq(&self.flags, &other.flags) - } -} - -impl ContentEq for Flags { - fn content_eq(&self, other: &Self) -> bool { - ContentEq::content_eq(&self.global, &other.global) - && ContentEq::content_eq(&self.ignore_case, &other.ignore_case) - && ContentEq::content_eq(&self.multiline, &other.multiline) - && ContentEq::content_eq(&self.unicode, &other.unicode) - && ContentEq::content_eq(&self.sticky, &other.sticky) - && ContentEq::content_eq(&self.dot_all, &other.dot_all) - && ContentEq::content_eq(&self.has_indices, &other.has_indices) - && ContentEq::content_eq(&self.unicode_sets, &other.unicode_sets) - } -} - impl<'a> ContentEq for 
Pattern<'a> { fn content_eq(&self, other: &Self) -> bool { ContentEq::content_eq(&self.body, &other.body) diff --git a/crates/oxc_regular_expression/src/generated/derive_content_hash.rs b/crates/oxc_regular_expression/src/generated/derive_content_hash.rs index 1d0fecfb1a298..0680061472d1e 100644 --- a/crates/oxc_regular_expression/src/generated/derive_content_hash.rs +++ b/crates/oxc_regular_expression/src/generated/derive_content_hash.rs @@ -10,26 +10,6 @@ use oxc_span::hash::ContentHash; #[allow(clippy::wildcard_imports)] use crate::ast::*; -impl<'a> ContentHash for RegularExpression<'a> { - fn content_hash(&self, state: &mut H) { - ContentHash::content_hash(&self.pattern, state); - ContentHash::content_hash(&self.flags, state); - } -} - -impl ContentHash for Flags { - fn content_hash(&self, state: &mut H) { - ContentHash::content_hash(&self.global, state); - ContentHash::content_hash(&self.ignore_case, state); - ContentHash::content_hash(&self.multiline, state); - ContentHash::content_hash(&self.unicode, state); - ContentHash::content_hash(&self.sticky, state); - ContentHash::content_hash(&self.dot_all, state); - ContentHash::content_hash(&self.has_indices, state); - ContentHash::content_hash(&self.unicode_sets, state); - } -} - impl<'a> ContentHash for Pattern<'a> { fn content_hash(&self, state: &mut H) { ContentHash::content_hash(&self.body, state); diff --git a/crates/oxc_regular_expression/src/lib.rs b/crates/oxc_regular_expression/src/lib.rs index 4bfce9caf4d49..c7c42f290537c 100644 --- a/crates/oxc_regular_expression/src/lib.rs +++ b/crates/oxc_regular_expression/src/lib.rs @@ -1,12 +1,9 @@ #![allow(clippy::missing_errors_doc)] mod ast_impl; -mod body_parser; mod diagnostics; -mod flags_parser; -mod literal_parser; mod options; -mod span_factory; +mod parser; mod surrogate_pair; mod generated { @@ -16,7 +13,4 @@ mod generated { } pub mod ast; -pub use crate::{ - ast_impl::visit, body_parser::PatternParser, flags_parser::FlagsParser, literal_parser::Parser, 
- options::ParserOptions, -}; +pub use crate::{ast_impl::visit, options::ParserOptions, parser::Parser}; diff --git a/crates/oxc_regular_expression/src/literal_parser.rs b/crates/oxc_regular_expression/src/literal_parser.rs deleted file mode 100644 index 56b7a2b10111d..0000000000000 --- a/crates/oxc_regular_expression/src/literal_parser.rs +++ /dev/null @@ -1,165 +0,0 @@ -use oxc_allocator::Allocator; -use oxc_diagnostics::Result; - -use crate::{ - ast, body_parser::PatternParser, diagnostics, flags_parser::FlagsParser, - options::ParserOptions, span_factory::SpanFactory, -}; - -/// LiteralParser -pub struct Parser<'a> { - allocator: &'a Allocator, - source_text: &'a str, - options: ParserOptions, - span_factory: SpanFactory, -} - -impl<'a> Parser<'a> { - pub fn new(allocator: &'a Allocator, source_text: &'a str, options: ParserOptions) -> Self { - Self { - allocator, - source_text, - options, - span_factory: SpanFactory::new(options.span_offset), - } - } - - pub fn parse(self) -> Result> { - // Precheck if the source text is a valid regular expression literal - // If valid, parse the pattern and flags with returned span offsets - let (body_start_offset, body_end_offset, flag_start_offset) = - parse_reg_exp_literal(self.source_text, &self.span_factory)?; - - // Parse flags first to know if unicode mode is enabled or not - let flags = FlagsParser::new( - self.allocator, - &self.source_text[flag_start_offset..], - #[allow(clippy::cast_possible_truncation)] - self.options.with_span_offset(self.options.span_offset + flag_start_offset as u32), - ) - .parse()?; - - // Then parse the pattern with the flags - let pattern_options = match (flags.unicode, flags.unicode_sets) { - (true, false) => self.options.with_unicode_mode(), - (_, true) => self.options.with_unicode_sets_mode(), - _ => self.options, - }; - - let pattern = PatternParser::new( - self.allocator, - &self.source_text[body_start_offset..body_end_offset], - #[allow(clippy::cast_possible_truncation)] - 
pattern_options.with_span_offset(self.options.span_offset + body_start_offset as u32), - ) - .parse()?; - - Ok(ast::RegularExpression { - span: self.span_factory.create(0, self.source_text.len()), - pattern, - flags, - }) - } -} - -/// Check passed source text is a valid regular expression literal. -/// ``` -/// / RegularExpressionBody / RegularExpressionFlags -/// ``` -/// Returns `(body_start_offset, body_end_offset, flag_start_offset)`. -fn parse_reg_exp_literal( - source_text: &str, - span_factory: &SpanFactory, -) -> Result<(usize, usize, usize)> { - let mut offset = 0; - let mut chars = source_text.chars().peekable(); - - let Some('/') = chars.next() else { - return Err(diagnostics::unexpected_literal_char(span_factory.create(offset, offset))); - }; - offset += 1; // '/' - - let body_start = offset; - - let mut in_escape = false; - let mut in_character_class = false; - loop { - match chars.peek() { - // Line terminators are not allowed - Some('\u{a}' | '\u{d}' | '\u{2028}' | '\u{2029}') | None => { - return Err(diagnostics::unterminated_literal( - span_factory.create(body_start, offset), - if in_character_class { "character class" } else { "regular expression" }, - )); - } - Some(&ch) => { - if in_escape { - in_escape = false; - } else if ch == '\\' { - in_escape = true; - } else if ch == '[' { - in_character_class = true; - } else if ch == ']' { - in_character_class = false; - } else if ch == '/' && !in_character_class - // `*` is not allowed as `RegularExpressionFirstChar` - || offset == body_start && ch == '*' - { - break; - } - - offset += ch.len_utf8(); - } - } - - chars.next(); - } - - let Some('/') = chars.next() else { - return Err(diagnostics::unexpected_literal_char(span_factory.create(offset, offset))); - }; - let body_end = offset; - - if body_end == body_start { - return Err(diagnostics::empty_literal(span_factory.create(0, body_end + 1))); - } - - Ok((body_start, body_end, body_end + 1)) -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] 
- fn parse_valid_reg_exp_literal() { - for literal_text in [ - "/(?:)/", - "/abc/", - "/abcd/igsmv", - r"/\w+/u", - r"/foo\/bar|baz/i", - "/[a-z]/", - "/正規表現/u", - "/あっち👈🏻/i", - "/👈🏻こっち/u", - ] { - let (body_start_offset, body_end_offset, flag_start_offset) = - parse_reg_exp_literal(literal_text, &SpanFactory::new(0)) - .unwrap_or_else(|_| panic!("{literal_text} should be parsed")); - - let body_text = &literal_text[body_start_offset..body_end_offset]; - let flag_text = &literal_text[flag_start_offset..]; - assert_eq!(format!("/{body_text}/{flag_text}",), literal_text); - } - } - - #[test] - fn parse_invalid_reg_exp_literal() { - for literal_text in - ["", "foo", ":(", "a\nb", "/", "/x", "/y\nz/", "/1[\n]/", "//", "///", "/*abc/", "/\\/"] - { - assert!(parse_reg_exp_literal(literal_text, &SpanFactory::new(0)).is_err()); - } - } -} diff --git a/crates/oxc_regular_expression/src/options.rs b/crates/oxc_regular_expression/src/options.rs index a67d058129d81..bb16c147cc047 100644 --- a/crates/oxc_regular_expression/src/options.rs +++ b/crates/oxc_regular_expression/src/options.rs @@ -6,21 +6,28 @@ pub struct ParserOptions { pub unicode_mode: bool, /// Extended Unicode mode(`v` flag) enabled or not. 
pub unicode_sets_mode: bool, + // TODO: Add `handle_escape_with_quote_type` like option to support `new RegExp("with \"escape\"")` } impl ParserOptions { #[must_use] - pub fn with_span_offset(self, span_offset: u32) -> ParserOptions { + pub fn with_span_offset(self, span_offset: u32) -> Self { ParserOptions { span_offset, ..self } } #[must_use] - pub fn with_unicode_mode(self) -> ParserOptions { - ParserOptions { unicode_mode: true, ..self } - } + pub fn with_flags(self, flags: &str) -> Self { + let (mut unicode_mode, mut unicode_sets_mode) = (false, false); + for ch in flags.chars() { + if ch == 'u' { + unicode_mode = true; + } + if ch == 'v' { + unicode_mode = true; + unicode_sets_mode = true; + } + } - #[must_use] - pub fn with_unicode_sets_mode(self) -> ParserOptions { - ParserOptions { unicode_mode: true, unicode_sets_mode: true, ..self } + ParserOptions { unicode_mode, unicode_sets_mode, ..self } } } diff --git a/crates/oxc_regular_expression/src/parser/mod.rs b/crates/oxc_regular_expression/src/parser/mod.rs new file mode 100644 index 0000000000000..5bd7ccd6b9574 --- /dev/null +++ b/crates/oxc_regular_expression/src/parser/mod.rs @@ -0,0 +1,266 @@ +mod parser_impl; +mod reader; +mod span_factory; +mod state; +mod unicode; +mod unicode_property; + +pub use parser_impl::Parser; + +#[cfg(test)] +mod test { + use crate::{Parser, ParserOptions}; + use oxc_allocator::Allocator; + + fn default() -> ParserOptions { + ParserOptions::default() + } + fn with_unicode_mode() -> ParserOptions { + ParserOptions { unicode_mode: true, ..Default::default() } + } + fn with_unicode_sets_mode() -> ParserOptions { + ParserOptions { unicode_mode: true, unicode_sets_mode: true, ..Default::default() } + } + + #[test] + fn should_pass() { + let allocator = Allocator::default(); + + for (source_text, options) in &[ + ("", default()), + ("a", default()), + ("a+", default()), + ("a*", default()), + ("a?", default()), + ("^$^$^$", default()), + ("(?=a){1}", default()), + ("(?!a){1}", 
default()), + ("a{1}", default()), + ("a{1", default()), + ("a|{", default()), + ("a{", default()), + ("a{,", default()), + ("a{1,", default()), + ("a{1,}", default()), + ("a{1,2}", default()), + ("x{9007199254740991}", default()), + ("x{9007199254740991,9007199254740991}", default()), + ("a|b", default()), + ("a|b|c", default()), + ("a|b+?|c", default()), + ("a+b*?c{1}d{2,}e{3,4}?", default()), + (r"^(?=ab)\b(?!cd)(?<=ef)\B(?.)\x1f", default()), + ("a]", default()), + ("a}", default()), + ("]", default()), + ("[]", default()), + ("[a]", default()), + ("[ab]", default()), + ("[a-b]", default()), + ("[-]", default()), + ("[a-]", default()), + ("[-a]", default()), + ("[-a-]", default()), + (r"[a\-b]", default()), + (r"[-a-b]", default()), + (r"[a-b-]", default()), + (r"[a\-b-]", default()), + (r"[\[\]\-]", default()), + ("[a-z0-9]", default()), + ("[a-a]", default()), + (r"[\d-\D]", default()), + (r"^([\ud801[\udc28-\udc4f])$", default()), + (r"[a-c]]", default()), + ( + r"[ϗϙϛϝϟϡϣϥϧϩϫϭϯ-ϳϵϸϻ-ϼа-џѡѣѥѧѩѫѭѯѱѳѵѷѹѻѽѿҁҋҍҏґғҕҗҙқҝҟҡңҥҧҩҫҭүұҳҵҷҹһҽҿӂӄӆӈӊӌӎ-ӏӑӓӕӗәӛӝӟӡӣӥӧөӫӭӯӱӳӵӷӹӻӽӿԁԃԅԇԉԋԍԏԑԓԕԗԙԛԝԟԡԣա-ևᴀ-ᴫᵢ-ᵷᵹ-ᶚḁḃḅḇḉḋḍḏḑḓḕḗḙḛḝḟḡḣḥḧḩḫḭḯḱḳḵḷḹḻḽḿṁṃṅṇṉṋṍṏṑṓṕṗṙṛṝṟṡṣṥṧṩṫṭṯṱṳṵṷṹṻṽṿẁẃẅẇẉẋẍẏẑẓẕ-ẝẟạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹỻỽỿ-ἇἐ-ἕἠ-ἧἰ-ἷὀ-ὅὐ-ὗὠ-ὧὰ]", + default(), + ), + (r"[a-z0-9[.\\]]", with_unicode_sets_mode()), + (r"[a&&b&&c]", with_unicode_sets_mode()), + (r"[a--b--c]", with_unicode_sets_mode()), + (r"[[a-z]--b--c]", with_unicode_sets_mode()), + (r"[[[[[[[[[[[[[[[[[[[[[[[[a]]]]]]]]]]]]]]]]]]]]]]]]", with_unicode_sets_mode()), + (r"[\q{}\q{a}\q{bc}\q{d|e|f}\q{|||}]", with_unicode_sets_mode()), + (r"(?A)\k", default()), + (r"(?)\k", default()), + (r"\k", default()), + (r"\k<4>", default()), + (r"\k", default()), + (r"(?)\k", default()), + (r"(?)\k", with_unicode_mode()), + (r"\1", default()), + (r"\1()", default()), + (r"\1()", with_unicode_mode()), + (r"(?..)(?..)", default()), + // TODO: ES2025 Duplicate named capturing groups + // (r"(?..)|(?..)", default()), + 
// (r"(?[0-9]{4})-[0-9]{2}|[0-9]{2}-(?[0-9]{4})", default()), + // (r"(?:(?x)|(?y))\k", default()), + ] { + let res = Parser::new(&allocator, source_text, *options).parse(); + if let Err(err) = res { + panic!("Failed to parse {source_text} with {options:?}\n💥 {err}"); + } + } + } + + #[test] + fn should_fail() { + let allocator = Allocator::default(); + + for (source_text, options) in &[ + ("a)", default()), + (r"a\", default()), + ("a]", with_unicode_mode()), + ("a}", with_unicode_mode()), + ("a|+", default()), + ("a|{", with_unicode_mode()), + ("a{", with_unicode_mode()), + ("a{1", with_unicode_mode()), + ("a{1,", with_unicode_mode()), + ("a{,", with_unicode_mode()), + ("x{9007199254740992}", default()), + ("x{9007199254740991,9007199254740992}", default()), + ("x{99999999999999999999999999999999999999999999999999}", default()), + (r"\99999999999999999999999999999999999999999999999999", default()), + (r"\u{FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF}", with_unicode_mode()), + ("(?=a", default()), + ("(?", with_unicode_mode()), + (r"\k<4>", with_unicode_mode()), + (r"\k", with_unicode_mode()), + ("a(?:", default()), + ("(a", default()), + ("(?", default()), + (r"(?.)", default()), + (r"(?.)", with_unicode_mode()), + (r"(?<\>.)", default()), + (r"(?<\>.)", with_unicode_mode()), + ("(?)", default()), + ("(?=a){1}", with_unicode_mode()), + ("(?!a){1}", with_unicode_mode()), + (r"[\d-\D]", with_unicode_mode()), + ("[", default()), + ("[", with_unicode_sets_mode()), + ("[[", with_unicode_sets_mode()), + ("[[]", with_unicode_sets_mode()), + ("[z-a]", default()), + (r"[a-c]]", with_unicode_mode()), + ( + 
r"^([a-zªµºß-öø-ÿāăąćĉċčďđēĕėęěĝğġģĥħĩīĭįıijĵķ-ĸĺļľŀłńņň-ʼnŋōŏőœŕŗřśŝşšţťŧũūŭůűųŵŷźżž-ƀƃƅƈƌ-ƍƒƕƙ-ƛƞơƣƥƨƪ-ƫƭưƴƶƹ-ƺƽ-ƿdžljnjǎǐǒǔǖǘǚǜ-ǝǟǡǣǥǧǩǫǭǯ-ǰdzǵǹǻǽǿȁȃȅȇȉȋȍȏȑȓȕȗșțȝȟȡȣȥȧȩȫȭȯȱȳ-ȹȼȿ-ɀɂɇɉɋɍɏ-ʓʕ-ʯͱͳͷͻ-ͽΐά-ώϐ-ϑϕ-ϗϙϛϝϟϡϣϥϧϩϫϭϯ-ϳϵϸϻ-ϼа-џѡѣѥѧѩѫѭѯѱѳѵѷѹѻѽѿҁҋҍҏґғҕҗҙқҝҟҡңҥҧҩҫҭүұҳҵҷҹһҽҿӂӄӆӈӊӌӎ-ӏӑӓӕӗәӛӝӟӡӣӥӧөӫӭӯӱӳӵӷӹӻӽӿԁԃԅԇԉԋԍԏԑԓԕԗԙԛԝԟԡԣա-ևᴀ-ᴫᵢ-ᵷᵹ-ᶚḁḃḅḇḉḋḍḏḑḓḕḗḙḛḝḟḡḣḥḧḩḫḭḯḱḳḵḷḹḻḽḿṁṃṅṇṉṋṍṏṑṓṕṗṙṛṝṟṡṣṥṧṩṫṭṯṱṳṵṷṹṻṽṿẁẃẅẇẉẋẍẏẑẓẕ-ẝẟạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹỻỽỿ-ἇἐ-ἕἠ-ἧἰ-ἷὀ-ὅὐ-ὗὠ-ὧὰ-ώᾀ-ᾇᾐ-ᾗᾠ-ᾧᾰ-ᾴᾶ-ᾷιῂ-ῄῆ-ῇῐ-ΐῖ-ῗῠ-ῧῲ-ῴῶ-ῷⁱⁿℊℎ-ℏℓℯℴℹℼ-ℽⅆ-ⅉⅎↄⰰ-ⱞⱡⱥ-ⱦⱨⱪⱬⱱⱳ-ⱴⱶ-ⱼⲁⲃⲅⲇⲉⲋⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱⲳⲵⲷⲹⲻⲽⲿⳁⳃⳅⳇⳉⳋⳍⳏⳑⳓⳕⳗⳙⳛⳝⳟⳡⳣ-ⳤⴀ-ⴥꙁꙃꙅꙇꙉꙋꙍꙏꙑꙓꙕꙗꙙꙛꙝꙟꙣꙥꙧꙩꙫꙭꚁꚃꚅꚇꚉꚋꚍꚏꚑꚓꚕꚗꜣꜥꜧꜩꜫꜭꜯ-ꜱꜳꜵꜷꜹꜻꜽꜿꝁꝃꝅꝇꝉꝋꝍꝏꝑꝓꝕꝗꝙꝛꝝꝟꝡꝣꝥꝧꝩꝫꝭꝯꝱ-ꝸꝺꝼꝿꞁꞃꞅꞇꞌff-stﬓ-ﬗa-z]|\ud801[\udc28-\udc4f]|\ud835[\udc1a-\udc33\udc4e-\udc54\udc56-\udc67\udc82-\udc9b\udcb6-\udcb9\udcbb\udcbd-\udcc3\udcc5-\udccf\udcea-\udd03\udd1e-\udd37\udd52-\udd6b\udd86-\udd9f\uddba-\uddd3\uddee-\ude07\ude22-\ude3b\ude56-\ude6f\ude8a-\udea5\udec2-\udeda\udedc-\udee1\udefc-\udf14\udf16-\udf1b\udf36-\udf4e\udf50-\udf55\udf70-\udf88\udf8a-\udf8f\udfaa-\udfc2\udfc4-\udfc9\udfcb])$", + default(), + ), + (r"[[\d-\D]]", with_unicode_sets_mode()), + (r"[a&&b--c]", with_unicode_sets_mode()), + (r"[a--b&&c]", with_unicode_sets_mode()), + (r"[\q{]", with_unicode_sets_mode()), + (r"[\q{\a}]", with_unicode_sets_mode()), + // TODO: ES2025 Duplicate named capturing groups + (r"(?..)|(?..)", default()), // This will be valid + // (r"(?|(?))", default()), // Nested, still invalid + ] { + assert!( + Parser::new(&allocator, source_text, *options).parse().is_err(), + "{source_text} should fail to parse with {options:?}!" 
+ ); + } + } + + #[test] + fn should_fail_early_errors() { + let allocator = Allocator::default(); + + for (source_text, options, is_err) in &[ + // No tests for 4,294,967,295 left parens + (r"(?..)(?..)", default(), true), + (r"a{2,1}", default(), true), + (r"(?)\k", default(), true), + (r"()\2", with_unicode_mode(), true), + (r"[a-\d]", with_unicode_mode(), true), + (r"[\d-z]", with_unicode_mode(), true), + (r"[\d-\d]", with_unicode_mode(), true), + (r"[z-a]", default(), true), + (r"\u{110000}", with_unicode_mode(), true), + (r"(?<\uD800\uDBFF>)", default(), true), + (r"\u{0}\u{110000}", with_unicode_mode(), true), + (r"(?)", default(), true), + (r"\p{Foo=Bar}", with_unicode_mode(), true), + (r"\p{Foo}", with_unicode_mode(), true), + (r"\p{Basic_Emoji}", with_unicode_mode(), true), + (r"\P{Basic_Emoji}", with_unicode_sets_mode(), true), + (r"[^\p{Basic_Emoji}]", with_unicode_sets_mode(), true), + (r"[[^\p{Basic_Emoji}]]", with_unicode_sets_mode(), true), + (r"[^\q{}]", with_unicode_sets_mode(), true), + (r"[[^\q{}]]", with_unicode_sets_mode(), true), + (r"[[^\q{ng}]]", with_unicode_sets_mode(), true), + (r"[[^\q{a|}]]", with_unicode_sets_mode(), true), + (r"[[^\q{ng}\q{o|k}]]", with_unicode_sets_mode(), true), + (r"[[^\q{o|k}\q{ng}\q{o|k}]]", with_unicode_sets_mode(), true), + (r"[[^\q{o|k}\q{o|k}\q{ng}]]", with_unicode_sets_mode(), true), + (r"[[^\q{}&&\q{ng}]]", with_unicode_sets_mode(), true), + (r"[[^\q{ng}&&\q{o|k}]]", with_unicode_sets_mode(), false), + (r"[[^\q{ng}&&\q{o|k}&&\q{ng}]]", with_unicode_sets_mode(), false), + (r"[[^\q{ng}--\q{o|k}]]", with_unicode_sets_mode(), true), + (r"[[^\q{o|k}--\q{ng}]]", with_unicode_sets_mode(), false), + (r"[[z-a]]", with_unicode_sets_mode(), true), + (r"[[[[[^[[[[\q{ng}]]]]]]]]]", with_unicode_sets_mode(), true), + (r"[^[[[[[[[[[[[[[[[[\q{ng}]]]]]]]]]]]]]]]]]", with_unicode_sets_mode(), true), + ] { + assert_eq!( + Parser::new(&allocator, source_text, *options).parse().is_err(), + *is_err, + "{source_text} should 
early error with {options:?}!" + ); + } + } + + #[test] + fn should_handle_empty() { + let allocator = Allocator::default(); + let pattern = Parser::new(&allocator, "", default()).parse().unwrap(); + + assert_eq!(pattern.body.body[0].body.len(), 1); + } + + #[test] + fn should_handle_unicode() { + let allocator = Allocator::default(); + let source_text = "このEmoji🥹の数が変わる"; + + for (options, expected) in + &[(default(), 15), (with_unicode_mode(), 14), (with_unicode_sets_mode(), 14)] + { + let pattern = Parser::new(&allocator, source_text, *options).parse().unwrap(); + assert_eq!(pattern.body.body[0].body.len(), *expected); + } + } +} diff --git a/crates/oxc_regular_expression/src/body_parser/parser.rs b/crates/oxc_regular_expression/src/parser/parser_impl.rs similarity index 99% rename from crates/oxc_regular_expression/src/body_parser/parser.rs rename to crates/oxc_regular_expression/src/parser/parser_impl.rs index 9b687f358b5db..a90559a2844d0 100644 --- a/crates/oxc_regular_expression/src/body_parser/parser.rs +++ b/crates/oxc_regular_expression/src/parser/parser_impl.rs @@ -3,15 +3,13 @@ use oxc_diagnostics::Result; use oxc_span::Atom as SpanAtom; use crate::{ - ast, - body_parser::{reader::Reader, state::State, unicode, unicode_property}, - diagnostics, + ast, diagnostics, options::ParserOptions, - span_factory::SpanFactory, + parser::{reader::Reader, span_factory::SpanFactory, state::State, unicode, unicode_property}, surrogate_pair, }; -pub struct PatternParser<'a> { +pub struct Parser<'a> { allocator: &'a Allocator, source_text: &'a str, span_factory: SpanFactory, @@ -19,7 +17,7 @@ pub struct PatternParser<'a> { state: State<'a>, } -impl<'a> PatternParser<'a> { +impl<'a> Parser<'a> { pub fn new(allocator: &'a Allocator, source_text: &'a str, options: ParserOptions) -> Self { // `RegExp` can not be empty. 
// - Literal `//` means just a single line comment @@ -35,7 +33,7 @@ impl<'a> PatternParser<'a> { } } - pub fn parse(&mut self) -> Result> { + pub fn parse(mut self) -> Result> { // Pre parse whole pattern to collect: // - the number of (named|unnamed) capturing groups // - For `\1` in `\1()` to be handled as indexed reference @@ -757,7 +755,7 @@ impl<'a> PatternParser<'a> { let (kind, body) = self.parse_class_contents()?; if self.reader.eat(']') { - let strings = PatternParser::may_contain_strings_in_class_contents(&kind, &body); + let strings = Parser::may_contain_strings_in_class_contents(&kind, &body); // [SS:EE] CharacterClass :: [^ ClassContents ] // It is a Syntax Error if MayContainStrings of the ClassContents is true. @@ -1317,7 +1315,7 @@ impl<'a> PatternParser<'a> { let (kind, body) = self.parse_class_contents()?; if self.reader.eat(']') { - let strings = PatternParser::may_contain_strings_in_class_contents(&kind, &body); + let strings = Parser::may_contain_strings_in_class_contents(&kind, &body); // [SS:EE] NestedClass :: [^ ClassContents ] // It is a Syntax Error if MayContainStrings of the ClassContents is true. 
diff --git a/crates/oxc_regular_expression/src/body_parser/reader.rs b/crates/oxc_regular_expression/src/parser/reader.rs similarity index 100% rename from crates/oxc_regular_expression/src/body_parser/reader.rs rename to crates/oxc_regular_expression/src/parser/reader.rs diff --git a/crates/oxc_regular_expression/src/span_factory.rs b/crates/oxc_regular_expression/src/parser/span_factory.rs similarity index 100% rename from crates/oxc_regular_expression/src/span_factory.rs rename to crates/oxc_regular_expression/src/parser/span_factory.rs diff --git a/crates/oxc_regular_expression/src/body_parser/state.rs b/crates/oxc_regular_expression/src/parser/state.rs similarity index 99% rename from crates/oxc_regular_expression/src/body_parser/state.rs rename to crates/oxc_regular_expression/src/parser/state.rs index 1ddcd181b8e04..50adbd1ee3715 100644 --- a/crates/oxc_regular_expression/src/body_parser/state.rs +++ b/crates/oxc_regular_expression/src/parser/state.rs @@ -1,6 +1,6 @@ use rustc_hash::FxHashSet; -use crate::body_parser::reader::Reader; +use crate::parser::reader::Reader; /// Currently all of properties are read only from outside of this module. /// Even inside of this module, it is not changed after initialized. 
diff --git a/crates/oxc_regular_expression/src/body_parser/unicode.rs b/crates/oxc_regular_expression/src/parser/unicode.rs similarity index 100% rename from crates/oxc_regular_expression/src/body_parser/unicode.rs rename to crates/oxc_regular_expression/src/parser/unicode.rs diff --git a/crates/oxc_regular_expression/src/body_parser/unicode_property.rs b/crates/oxc_regular_expression/src/parser/unicode_property.rs similarity index 100% rename from crates/oxc_regular_expression/src/body_parser/unicode_property.rs rename to crates/oxc_regular_expression/src/parser/unicode_property.rs diff --git a/crates/oxc_transformer/src/regexp/mod.rs b/crates/oxc_transformer/src/regexp/mod.rs index 2fb4334945b61..05ebee848784e 100644 --- a/crates/oxc_transformer/src/regexp/mod.rs +++ b/crates/oxc_transformer/src/regexp/mod.rs @@ -241,12 +241,10 @@ fn try_parse_pattern<'a>( flags: RegExpFlags, ctx: &mut TraverseCtx<'a>, ) -> Result> { - use oxc_regular_expression::{ParserOptions, PatternParser}; - - let options = ParserOptions { - span_offset: span.start + 1, // exclude `/` - unicode_mode: flags.contains(RegExpFlags::U) || flags.contains(RegExpFlags::V), - unicode_sets_mode: flags.contains(RegExpFlags::V), - }; - PatternParser::new(ctx.ast.allocator, raw, options).parse() + use oxc_regular_expression::{Parser, ParserOptions}; + + let options = ParserOptions::default() + .with_span_offset(span.start + 1) // exclude `/` + .with_flags(&flags.to_string()); + Parser::new(ctx.ast.allocator, raw, options).parse() } diff --git a/tasks/coverage/src/driver.rs b/tasks/coverage/src/driver.rs index d22fb31aee873..670ff6e9a0b0e 100644 --- a/tasks/coverage/src/driver.rs +++ b/tasks/coverage/src/driver.rs @@ -2,15 +2,12 @@ use std::{ops::ControlFlow, path::PathBuf}; use oxc::{ allocator::Allocator, - ast::{ - ast::{Program, RegExpFlags}, - Trivias, - }, + ast::{ast::Program, Trivias}, codegen::CodegenOptions, diagnostics::OxcDiagnostic, minifier::CompressOptions, parser::{ParseOptions, 
ParserReturn}, - regular_expression::{ParserOptions, PatternParser}, + regular_expression::{Parser, ParserOptions}, semantic::{ post_transform_checker::{check_semantic_after_transform, check_semantic_ids}, Semantic, SemanticBuilderReturn, @@ -166,15 +163,11 @@ impl Driver { continue; }; let printed1 = pattern.to_string(); - let flags = literal.regex.flags; - let printed2 = match PatternParser::new( + let flags = literal.regex.flags.to_string(); + let printed2 = match Parser::new( &allocator, &printed1, - ParserOptions { - span_offset: 0, - unicode_mode: flags.contains(RegExpFlags::U) || flags.contains(RegExpFlags::V), - unicode_sets_mode: flags.contains(RegExpFlags::V), - }, + ParserOptions::default().with_flags(&flags), ) .parse() {