Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(regular_expression): Intro ConstructorParser(and LiteralParser) to handle escape sequence in RegExp('pat') #6635

Merged
merged 1 commit into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions crates/oxc_regular_expression/examples/parse_literal.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#![allow(clippy::print_stdout)]

use oxc_allocator::Allocator;
use oxc_regular_expression::{Parser, ParserOptions};
use oxc_regular_expression::{LiteralParser, Options};

fn main() {
let allocator = Allocator::default();

for (pattern, flags) in [
for (pattern_text, flags_text) in [
(r"ab", ""),
(r"abc", "i"),
(r"abcd", "igv"),
Expand Down Expand Up @@ -43,14 +43,16 @@ fn main() {
(r"[\bb]", ""),
(r"a{2,1}", "v"), // Error
] {
let parser = Parser::new(
let parser = LiteralParser::new(
&allocator,
pattern,
ParserOptions::default().with_span_offset(1).with_flags(flags),
pattern_text,
Some(flags_text),
// +1 for added `/` in error reports
Options { pattern_span_offset: 1, ..Options::default() },
);
let ret = parser.parse();

let literal = format!("/{pattern}/{flags}");
let literal = format!("/{pattern_text}/{flags_text}");
println!("Parse: {literal}");
match ret {
Ok(pattern) => {
Expand Down
4 changes: 2 additions & 2 deletions crates/oxc_regular_expression/examples/regex_visitor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use oxc_allocator::Allocator;
use oxc_regular_expression::{
visit::{RegExpAstKind, Visit},
Parser, ParserOptions,
LiteralParser, Options,
};
use oxc_span::GetSpan;

Expand All @@ -23,7 +23,7 @@ fn main() {
let source_text = r"(https?:\/\/github\.com\/(([^\s]+)\/([^\s]+))\/([^\s]+\/)?(issues|pull)\/([0-9]+))|(([^\s]+)\/([^\s]+))?#([1-9][0-9]*)($|[\s\:\;\-\(\=])";

let allocator = Allocator::default();
let parser = Parser::new(&allocator, source_text, ParserOptions::default());
let parser = LiteralParser::new(&allocator, source_text, None, Options::default());
let pattern = parser.parse().unwrap();

let mut visitor = TestVisitor;
Expand Down
12 changes: 4 additions & 8 deletions crates/oxc_regular_expression/src/ast_impl/display.rs
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ where

#[cfg(test)]
mod test {
use crate::{Parser, ParserOptions};
use crate::{LiteralParser, Options};
use oxc_allocator::Allocator;

type Case<'a> = (
Expand Down Expand Up @@ -557,13 +557,9 @@ mod test {
let pattern = &input[left_slash + 1..right_slash];
let flags = &input[right_slash + 1..];

let actual = Parser::new(
allocator,
pattern,
ParserOptions::default().with_span_offset(1).with_flags(flags),
)
.parse()
.unwrap();
let actual = LiteralParser::new(allocator, pattern, Some(flags), Options::default())
.parse()
.unwrap();

let expect = output.unwrap_or(input);
assert_eq!(expect, format!("/{actual}/{flags}")); // This uses `Display` impls
Expand Down
25 changes: 25 additions & 0 deletions crates/oxc_regular_expression/src/diagnostics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,31 @@ use oxc_span::Span;

const PREFIX: &str = "Invalid regular expression:";

#[cold]
pub fn invalid_input(span: Span) -> OxcDiagnostic {
OxcDiagnostic::error(format!("{PREFIX} Invalid input string literal")).with_label(span)
}

// ---

#[cold]
pub fn unknown_flag(span: Span, flag: &str) -> OxcDiagnostic {
OxcDiagnostic::error(format!("{PREFIX} Unknown flag: `{flag}` found")).with_label(span)
}

#[cold]
pub fn duplicated_flags(span: Span, flag: &str) -> OxcDiagnostic {
OxcDiagnostic::error(format!("{PREFIX} Duplicated flag: `{flag}` found")).with_label(span)
}

#[cold]
pub fn invalid_unicode_flags(span: Span) -> OxcDiagnostic {
OxcDiagnostic::error(format!("{PREFIX} Invalid unicode flags combination `u` and `v`"))
.with_label(span)
}

// ---

#[cold]
pub fn duplicated_capturing_group_names(spans: Vec<Span>) -> OxcDiagnostic {
OxcDiagnostic::error(format!("{PREFIX} Duplicated capturing group names")).with_labels(spans)
Expand Down
90 changes: 89 additions & 1 deletion crates/oxc_regular_expression/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,92 @@ mod generated {
}

pub mod ast;
pub use crate::{ast_impl::visit, options::ParserOptions, parser::Parser};
pub use crate::{
ast_impl::visit,
options::Options,
parser::{ConstructorParser, LiteralParser},
};

// LEGACY APIS TO BE REMOVED SOON! ============================================

/// Options for the legacy `Parser` API.
#[derive(Clone, Copy, Debug, Default)]
pub struct ParserOptions {
    /// Offset used to adjust span positions of the pattern.
    pub span_offset: u32,
    /// Whether unicode mode (`u` or `v` flag) is enabled.
    pub unicode_mode: bool,
    /// Whether unicode sets mode (`v` flag) is enabled.
    pub unicode_sets_mode: bool,
    /// Whether the source text should be parsed as a string literal.
    pub parse_string_literal: bool,
}

impl ParserOptions {
    /// Returns a copy of these options with `span_offset` replaced.
    #[must_use]
    pub fn with_span_offset(mut self, span_offset: u32) -> Self {
        self.span_offset = span_offset;
        self
    }

    /// Returns a copy of these options with both unicode modes recomputed from `flags`.
    ///
    /// Any previously set unicode modes are overwritten: `v` enables both modes,
    /// `u` enables only `unicode_mode`, and every other character is ignored.
    #[must_use]
    pub fn with_flags(mut self, flags: &str) -> Self {
        let has_v = flags.contains('v');
        self.unicode_sets_mode = has_v;
        self.unicode_mode = has_v || flags.contains('u');
        self
    }

    /// Returns a copy of these options with `parse_string_literal` switched on.
    #[must_use]
    pub fn with_parse_string_literal(mut self) -> Self {
        self.parse_string_literal = true;
        self
    }
}

/// Legacy parser facade that forwards to `LiteralParser` or `ConstructorParser`.
pub struct Parser<'a> {
    allocator: &'a oxc_allocator::Allocator,
    source_text: &'a str,
    options: ParserOptions,
}

impl<'a> Parser<'a> {
    /// Stores the allocator, the pattern source text and the legacy options
    /// for a later call to `parse`.
    pub fn new(
        allocator: &'a oxc_allocator::Allocator,
        source_text: &'a str,
        options: ParserOptions,
    ) -> Self {
        Self { allocator, source_text, options }
    }

    /// Translates the legacy options into `Options` plus a flags text and runs
    /// the matching new parser.
    ///
    /// `ConstructorParser` is used when `parse_string_literal` is set (the flags
    /// text is then passed wrapped in single quotes); otherwise `LiteralParser`
    /// is used with the bare flag.
    pub fn parse(self) -> oxc_diagnostics::Result<crate::ast::Pattern<'a>> {
        let Self { allocator, source_text, options: legacy } = self;

        let options = Options {
            pattern_span_offset: legacy.span_offset,
            // Never used: the legacy API carries no offset for flags.
            flags_span_offset: 0,
        };

        // `v` takes precedence over `u`; the combination is not validated here.
        // Each entry is (bare flag, flag as a quoted string literal).
        let flags = if legacy.unicode_sets_mode {
            Some(("v", "'v'"))
        } else if legacy.unicode_mode {
            Some(("u", "'u'"))
        } else {
            None
        };

        if legacy.parse_string_literal {
            let flags_text = flags.map(|(_, quoted)| quoted);
            ConstructorParser::new(allocator, source_text, flags_text, options).parse()
        } else {
            let flags_text = flags.map(|(bare, _)| bare);
            LiteralParser::new(allocator, source_text, flags_text, options).parse()
        }
    }
}
36 changes: 5 additions & 31 deletions crates/oxc_regular_expression/src/options.rs
Original file line number Diff line number Diff line change
@@ -1,33 +1,7 @@
#[derive(Clone, Copy, Debug, Default)]
pub struct ParserOptions {
/// Used to adjust Span positions to fit the global source code.
pub span_offset: u32,
/// Unicode mode(`u` or `v` flag) enabled or not.
pub unicode_mode: bool,
/// Extended Unicode mode(`v` flag) enabled or not.
pub unicode_sets_mode: bool,
// TODO: Add `handle_escape_with_quote_type` like option to support `new RegExp("with \"escape\"")`
}

impl ParserOptions {
#[must_use]
pub fn with_span_offset(self, span_offset: u32) -> Self {
ParserOptions { span_offset, ..self }
}

#[must_use]
pub fn with_flags(self, flags: &str) -> Self {
let (mut unicode_mode, mut unicode_sets_mode) = (false, false);
for ch in flags.chars() {
if ch == 'u' {
unicode_mode = true;
}
if ch == 'v' {
unicode_mode = true;
unicode_sets_mode = true;
}
}

ParserOptions { unicode_mode, unicode_sets_mode, ..self }
}
pub struct Options {
/// Used to adjust `Span` positions to fit the global source code.
pub pattern_span_offset: u32,
/// Used to adjust `Span` positions to fit the global source code.
pub flags_span_offset: u32,
}
119 changes: 119 additions & 0 deletions crates/oxc_regular_expression/src/parser/flags_parser.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
use oxc_diagnostics::Result;
use rustc_hash::FxHashSet;

use crate::{
diagnostics,
parser::{reader::Reader, span_factory::SpanFactory},
};

/// Parser for the flags part of a regular expression.
///
/// Reads code points from a `Reader` and reports errors with spans built by a
/// `SpanFactory`.
pub struct FlagsParser<'a> {
    reader: Reader<'a>,
    span_factory: SpanFactory,
}

impl<'a> FlagsParser<'a> {
    /// Creates a flags parser over `reader`; `span_offset` is handed to the
    /// `SpanFactory` used for error spans.
    pub fn new(reader: Reader<'a>, span_offset: u32) -> Self {
        let span_factory = SpanFactory::new(span_offset);
        Self { reader, span_factory }
    }

    /// Consumes every flag from the reader and validates it.
    ///
    /// Returns: (is_unicode_mode, is_unicode_sets_mode)
    ///
    /// # Errors
    ///
    /// Returns a diagnostic for the first flag that is, in this order of checks:
    /// a duplicate of an earlier flag, not one of `d g i m s u v y`, or `u`/`v`
    /// when the other of the two was already seen.
    pub fn parse(mut self) -> Result<(bool, bool)> {
        const FLAG_U: u32 = 'u' as u32;
        const FLAG_V: u32 = 'v' as u32;

        let mut seen = FxHashSet::default();
        let (mut is_unicode_mode, mut is_unicode_sets_mode) = (false, false);

        while let Some(cp) = self.reader.peek() {
            // Capture the offsets around the flag so errors can point at it.
            let start = self.reader.offset();
            self.reader.advance();
            let end = self.reader.offset();

            if seen.contains(&cp) {
                return Err(diagnostics::duplicated_flags(
                    self.span_factory.create(start, end),
                    &self.reader.atom(start, end),
                ));
            }

            let is_known_flag = matches!(
                char::from_u32(cp),
                Some('d' | 'g' | 'i' | 'm' | 's' | 'u' | 'v' | 'y')
            );
            if !is_known_flag {
                return Err(diagnostics::unknown_flag(
                    self.span_factory.create(start, end),
                    &self.reader.atom(start, end),
                ));
            }

            if cp == FLAG_U || cp == FLAG_V {
                // `u` and `v` are mutually exclusive.
                let counterpart = if cp == FLAG_U { FLAG_V } else { FLAG_U };
                if seen.contains(&counterpart) {
                    return Err(diagnostics::invalid_unicode_flags(
                        self.span_factory.create(start, end),
                    ));
                }

                is_unicode_mode = true;
                if cp == FLAG_V {
                    is_unicode_sets_mode = true;
                }
            }

            seen.insert(cp);
        }

        Ok((is_unicode_mode, is_unicode_sets_mode))
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Initializes a `Reader` over `text` and runs `FlagsParser` on it with a
    /// span offset of 0.
    ///
    /// `quoted` is forwarded as the third `Reader::initialize` argument; the
    /// cases below set it only for inputs written as quoted string literals.
    fn parse_flags(text: &str, quoted: bool) -> Result<(bool, bool)> {
        let reader = Reader::initialize(text, true, quoted).unwrap();
        FlagsParser::new(reader, 0).parse()
    }

    #[test]
    fn should_pass() {
        let cases = [
            ("", (false, false)),
            ("i", (false, false)),
            ("u", (true, false)),
            ("v", (true, true)),
            ("vg", (true, true)),
        ];

        for (text, expected) in cases {
            assert_eq!(parse_flags(text, false).unwrap(), expected);
        }
    }

    #[test]
    fn should_fail() {
        // To inspect a report: `err.unwrap_err().with_source_code(text)`.
        for text in ["uv", "vu", "uu", "vv", "gg", "$"] {
            assert!(parse_flags(text, false).is_err());
        }

        for text in [r#""uv""#, "'V'", "\"-\"", r#""\162""#] {
            assert!(parse_flags(text, true).is_err());
        }
    }

    #[test]
    fn string_literal() {
        // Every spelling below must be read as the single flag `u`.
        let cases = [
            ("u", false),
            ("'u'", true),
            (r#""\165""#, true),
            (r#""\x75""#, true),
            (r#""\u0075""#, true),
            (r#""\u{0075}""#, true),
        ];

        for (text, quoted) in cases {
            assert_eq!(parse_flags(text, quoted).unwrap(), (true, false));
        }
    }
}
Loading
Loading