From 223b153249386492ac60d47a8806a4aa465b9587 Mon Sep 17 00:00:00 2001 From: Demali-876 <90882773+Demali-876@users.noreply.github.com> Date: Tue, 8 Oct 2024 12:17:28 -0400 Subject: [PATCH] Initial Commit, Changelog update to follow --- CHANGELOG.md | 27 +- src/motoko_regex/Compiler.mo | 560 ++++++++++++++++++--------------- src/motoko_regex/Extensions.mo | 10 +- src/motoko_regex/Lexer.mo | 17 +- src/motoko_regex/Optimizer.mo | 4 +- src/motoko_regex/Parser.mo | 183 +++++++---- src/motoko_regex/Regex.mo | 59 ++-- src/motoko_regex/Types.mo | 61 ++-- src/motoko_regex/teststrings | 3 +- 9 files changed, 531 insertions(+), 393 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f24f33a..f7a0552 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,9 +3,11 @@ ## 🚀 New Features - **Flattened AST Structure** - - Introduced a new, flattened AST structure to simplify regex expression handling. This includes using lists for concatenations and alternations, reducing the depth of nested expressions. + - [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | New:** Introduced a new, flattened AST structure to simplify regex expression handling. This includes using lists for concatenations and alternations, reducing the depth of nested expressions. - **Single Group Token** - - Introduced a unified `#Group` token that now encapsulates the group modifier and sub-expression, streamlining group handling in both the lexer and the parser. + - [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | New:** Introduced a unified `#Group` token that now encapsulates the group modifier and sub-expression, streamlining group handling in both the lexer and the parser. +- **Capture Index Tracking** + - Implemented capture group index tracking for pre-matching optimization. ## 🐛 Bug Fixes @@ -27,26 +29,31 @@ ## 🔄 Changes - **Reduced Token Count in Lexer** - - Reduced the number of tokens in the lexer by eliminating `#GroupStart` and `#GroupEnd` in favor of a single `#Group` token, simplifying group handling during the parsing process. + - [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | Improved:**Reduced the number of tokens in the lexer by eliminating `#GroupStart` and `#GroupEnd` in favor of a single `#Group` token, simplifying group handling during the parsing process. - **Unified Group Token** - - Removed the standalone `GroupModifierType` token, integrating it into the `#Group` token type. + - [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | Improved :** Removed the standalone `GroupModifierType` token, integrating it into the `#Group` token type. - **NextToken() Improvements** - - Removed redundant cases. + - [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | Improved:** Removed redundant cases. +- **Complete Parser and Compiler Redesign** + - Optimized NFA generation using Thompson's Construction Algorithm. + - Complete Parser rework using new ast structure. + - Introduction of Parser Errors. Parser now produces an AST or an Error. ## ❌ Removed - [#6279c34](https://github.com/Demali-876/motoko_regex_engine/commit/6279c34557a50328ac43555533fbf5708f867679) **Date: 10-03-2024 | Removed:** Removed `#QuantifierRange` token. 
All ranges are now handled by the `#Quantifier` token. +- []() Eliminated Redundant Tokens --- ### **To-Do Checklist** -- [ ] **Parser Overhaul** - - [ ] Adapt the parser to utilize the new flattened AST structure. - - [ ] Ensure proper handling of the unified `#Group` token, including its modifiers and sub-expression references. +- [x] **Parser Overhaul | Status: Completed Date: 10-08-2024** Commit: + - [x] Adapt the parser to utilize the new flattened AST structure. + - [x] Ensure proper handling of the unified `#Group` token, including its modifiers and sub-expression references. -- [ ] **NFA Construction** - - [ ] Refactor NFA construction to take advantage of the flattened AST for incremental optimization. +- [x] **NFA Construction | Status: Started Date: 10-08-2024** Commit: + - [x] Refactor NFA construction to take advantage of the flattened AST for incremental optimization. - [ ] Implement state reduction and bisimulation in the NFA to prevent state explosion. - [ ] **Incremental Optimization** diff --git a/src/motoko_regex/Compiler.mo b/src/motoko_regex/Compiler.mo index e96aefa..a7f706f 100644 --- a/src/motoko_regex/Compiler.mo +++ b/src/motoko_regex/Compiler.mo @@ -8,291 +8,355 @@ import Extensions "Extensions"; import Optimizer "Optimizer"; module { - /* public class Compiler() { - private var nextState : Types.State = 0; - private var captureGroups = Buffer.Buffer(8); - - public func compile(ast : Types.AST) : Types.CompiledRegex { - let transitions = Buffer.Buffer<(Types.State, Types.Transition, Types.State)>(16); - let (start, end) = switch (ast) { - case (#node(node)) compileNode(node, transitions); - }; - let optimizer = Optimizer.Optimizer(); - optimizer.optimize( - { - transitions = Buffer.toArray(transitions); - startState = start; - acceptStates = [end]; - captureGroups = Buffer.toArray(captureGroups); - }); - }; + public class Compiler() { + type State = Types.State; + type NFA = Types.CompiledRegex; + type Transition = Types.Transition; + type TransitionTable = Types.TransitionTable; - private func compileNode(node : Types.ASTNode, transitions : Buffer.Buffer<(Types.State, Types.Transition, Types.State)>) : (Types.State, Types.State) { - switch (node) { - case (#Character(char)) compileCharacter(char, transitions); - case (#Concatenation(left, right)) compileConcatenation(left, right, transitions); - case (#Alternation(left, right)) compileAlternation(left, right, transitions); - case (#Quantifier(quantType, subExpr)) compileQuantifier(quantType, subExpr, transitions); - case (#Group(subExpr)) compileGroup(subExpr, transitions); - case (#CharacterClass(isNegated, classes,)) compileCharacterClass(isNegated, classes, transitions); - case (#Anchor(anchorType)) compileAnchor(anchorType, transitions); - case (#Metacharacter(metaType)) compileMetacharacter(metaType, transitions); + public func compile(ast: Types.ASTNode): NFA { + let startState: State = 0; + let (transitionTable, acceptStates) = buildNFA(ast, startState); + { + transitions = transitionTable; + startState = startState; + acceptStates = acceptStates; } }; - private func compileCharacter(char : Char, transitions : Buffer.Buffer<(Types.State, Types.Transition, Types.State)>) : (Types.State, Types.State) { - let start = nextState; - nextState += 1; - let end = nextState; - nextState += 1; - transitions.add((start, #Char(char), end)); - (start, end) - }; + public func buildNFA(ast: Types.ASTNode, startState: State): (TransitionTable, [State]) { + switch (ast) { - private func compileConcatenation(left : 
Types.AST, right : Types.AST, transitions : Buffer.Buffer<(Types.State, Types.Transition, Types.State)>) : (Types.State, Types.State) { - let (leftStart, leftEnd) = switch (left) { - case (#node(node)) compileNode(node, transitions); - }; - let (rightStart, rightEnd) = switch (right) { - case (#node(node)) compileNode(node, transitions); + case (#Character(char)) { + let acceptState: State = startState + 1; + let transition: Transition = #Char(char); + let transitionTable: TransitionTable = [(startState, transition, acceptState)]; + (transitionTable, [acceptState]); }; - transitions.add((leftEnd, #Epsilon, rightStart)); - (leftStart, rightEnd) - }; - private func compileAlternation(left : Types.AST, right : Types.AST, transitions : Buffer.Buffer<(Types.State, Types.Transition, Types.State)>) : (Types.State, Types.State) { - let start = nextState; - nextState += 1; - let (leftStart, leftEnd) = switch (left) { - case (#node(node)) compileNode(node, transitions); - }; - let (rightStart, rightEnd) = switch (right) { - case (#node(node)) compileNode(node, transitions); + case (#Concatenation(subExprs)) { + var currentStartState: State = startState; + let transitionBuffer = Buffer.Buffer<(State, Transition, State)>(subExprs.size()); + var acceptStates: [State] = []; + + for (subExpr in subExprs.vals()) { + let (subTransitionTable, subAcceptStates) = buildNFA(subExpr, currentStartState); + for ((fromState, transition, toState) in subTransitionTable.vals()) { + transitionBuffer.add((fromState, transition, toState)); + }; + currentStartState := subAcceptStates[0]; + acceptStates := subAcceptStates; + }; + + (Buffer.toArray(transitionBuffer), acceptStates); }; - let end = nextState; - nextState += 1; - transitions.add((start, #Epsilon, leftStart)); - transitions.add((start, #Epsilon, rightStart)); - transitions.add((leftEnd, #Epsilon, end)); - transitions.add((rightEnd, #Epsilon, end)); - (start, end) - }; - private func compileQuantifier(quantType: Types.QuantifierType, subExpr: Types.AST, transitions: Buffer.Buffer<(Types.State, Types.Transition, Types.State)>): (Types.State, Types.State) { - let (subStart, subEnd) = switch (subExpr) { - case (#node(node)) compileNode(node, transitions); - }; + case (#Alternation(subExprs)) { + let newStartState: State = startState; + let newAcceptState: State = newStartState + subExprs.size() + 1; + let transitionBuffer = Buffer.Buffer<(State, Transition, State)>(subExprs.size() * 2); + var acceptStates: [State] = [newAcceptState]; - let start = nextState; - nextState += 1; - let end = nextState; - nextState += 1; - - // Destructure the quantifier type - let {min; max; mode} = quantType; - - // Handle 'min' repetitions - var currentState = start; - for (_ in Iter.range(0, min - 1)) { - let nextState = getNextState(); - transitions.add((currentState, #Epsilon, subStart)); - transitions.add((subEnd, #Epsilon, nextState)); - currentState := nextState; - }; - // Handle additional repetitions based on the quantifier mode and max value - switch (max) { - case (null) { // Infinite upper bound - switch (mode) { - case (#Greedy) { - transitions.add((currentState, #Epsilon, subStart)); - transitions.add((subEnd, #Epsilon, currentState)); - transitions.add((currentState, #Epsilon, end)); - }; - case (#Lazy) { - transitions.add((currentState, #Epsilon, end)); - transitions.add((currentState, #Epsilon, subStart)); - transitions.add((subEnd, #Epsilon, currentState)); - }; - case (#Possessive) { - transitions.add((currentState, #Epsilon, subStart)); - transitions.add((subEnd, 
#Epsilon, currentState)); - }; + for (subExpr in subExprs.vals()) { + let (subTransitionTable, subAcceptStates) = buildNFA(subExpr, newStartState + 1); + transitionBuffer.add((newStartState, #Epsilon, subTransitionTable[0].0)); + + for ((fromState, transition, toState) in subTransitionTable.vals()) { + transitionBuffer.add((fromState, transition, toState)); }; - }; - case (?maxVal) { - if (maxVal > min) { - // Add optional repetitions based on the mode - for (_ in Iter.range(0, maxVal - min - 1)) { - let nextState = getNextState(); - switch (mode) { - case (#Greedy) { - transitions.add((currentState, #Epsilon, subStart)); - transitions.add((currentState, #Epsilon, nextState)); - transitions.add((subEnd, #Epsilon, nextState)); - }; - case (#Lazy) { - transitions.add((currentState, #Epsilon, nextState)); - transitions.add((currentState, #Epsilon, subStart)); - transitions.add((subEnd, #Epsilon, nextState)); - }; - case (#Possessive) { - transitions.add((currentState, #Epsilon, subStart)); - transitions.add((subEnd, #Epsilon, nextState)); - }; - }; - currentState := nextState; - }; + + for (acceptState in subAcceptStates.vals()) { + transitionBuffer.add((acceptState, #Epsilon, newAcceptState)); }; - transitions.add((currentState, #Epsilon, end)); - }; - }; + }; - (start, end) -}; + (Buffer.toArray(transitionBuffer), acceptStates); + }; - private func getNextState() : Types.State { - let state = nextState; - nextState += 1; - state - }; + case (#Quantifier { subExpr; min; max; mode }) { + let transitionBuffer = Buffer.Buffer<(State, Transition, State)>(10); + let (subTransitionTable, subAcceptStates) = buildNFA(subExpr, startState + 1); + let quantifierStartState: State = startState; + let quantifierAcceptState: State = startState + subTransitionTable.size() + 2; + // Assign default max value (100) if max is null + let maxVal = switch (max) { + case (null) 100; + case (?value) value; + }; - private func compileGroup(subExpr : Types.AST, transitions : Buffer.Buffer<(Types.State, Types.Transition, Types.State)>) : (Types.State, Types.State) { - let start = nextState; - nextState += 1; - let (subStart, subEnd) = switch (subExpr) { - case (#node(node)) compileNode(node, transitions); + for ((fromState, transition, toState) in subTransitionTable.vals()) { + transitionBuffer.add((fromState, transition, toState)); }; - let end = nextState; - nextState += 1; - transitions.add((start, #Epsilon, subStart)); - transitions.add((subEnd, #Epsilon, end)); - captureGroups.add({ startState = start; endState = end }); - (start, end) - }; - private func compileCharacterClass(isNegated : Bool, classes : [Types.CharacterClass], transitions : Buffer.Buffer<(Types.State, Types.Transition, Types.State)>) : (Types.State, Types.State) { - let start = nextState; - nextState += 1; - let end = nextState; - nextState += 1; - let ranges = Buffer.Buffer<(Char, Char)>(classes.size()); - - for (c in classes.vals()) { - switch (c) { - case (#Single(char)) { - ranges.add((char, char)); + if (min == 0 and max == null) { + switch (mode) { + case (#Greedy) { + transitionBuffer.add((quantifierStartState, #Epsilon, subTransitionTable[0].0)); + for (acceptState in subAcceptStates.vals()) { + transitionBuffer.add((acceptState, #Epsilon, quantifierAcceptState)); // Exit loop + transitionBuffer.add((acceptState, #Epsilon, quantifierStartState)); // Loop back + }; + }; + case (#Lazy) { + transitionBuffer.add((quantifierStartState, #Epsilon, quantifierAcceptState)); // Match 0 times first + transitionBuffer.add((quantifierStartState, 
#Epsilon, subTransitionTable[0].0)); // Try to match sub-expression + for (acceptState in subAcceptStates.vals()) { + transitionBuffer.add((acceptState, #Epsilon, quantifierStartState)); // Loop back for more matches + }; + }; + case (#Possessive) { + transitionBuffer.add((quantifierStartState, #Epsilon, subTransitionTable[0].0)); + for (acceptState in subAcceptStates.vals()) { + transitionBuffer.add((acceptState, #Epsilon, quantifierAcceptState)); // Exit loop, no backtracking + }; + }; }; - case (#Range(from, to)) { - ranges.add((from, to)); + } else if (min == 1 and max == null) { + switch (mode) { + case (#Greedy) { + transitionBuffer.add((quantifierStartState, #Epsilon, subTransitionTable[0].0)); + for (acceptState in subAcceptStates.vals()) { + transitionBuffer.add((acceptState, #Epsilon, quantifierAcceptState)); + transitionBuffer.add((acceptState, #Epsilon, quantifierStartState)); + }; + }; + case (#Lazy) { + transitionBuffer.add((quantifierStartState, #Epsilon, subTransitionTable[0].0)); + transitionBuffer.add((quantifierStartState, #Epsilon, quantifierAcceptState)); // Try to match 0 times + for (acceptState in subAcceptStates.vals()) { + transitionBuffer.add((acceptState, #Epsilon, quantifierStartState)); + }; + }; + case (#Possessive) { + transitionBuffer.add((quantifierStartState, #Epsilon, subTransitionTable[0].0)); + for (acceptState in subAcceptStates.vals()) { + transitionBuffer.add((acceptState, #Epsilon, quantifierAcceptState)); + }; + }; }; - case (#Metacharacter(metaType)) { - let metaRanges = Extensions.metacharToRanges(metaType); - for (range in metaRanges.vals()) { - ranges.add(range); + } else if (min == 0 and max == ?1) { + switch (mode) { + case (#Greedy) { + transitionBuffer.add((quantifierStartState, #Epsilon, subTransitionTable[0].0)); + for (acceptState in subAcceptStates.vals()) { + transitionBuffer.add((acceptState, #Epsilon, quantifierAcceptState)); + }; + }; + case (#Lazy) { + transitionBuffer.add((quantifierStartState, #Epsilon, quantifierAcceptState)); // Match 0 times first + transitionBuffer.add((quantifierStartState, #Epsilon, subTransitionTable[0].0)); // Try to match 1 time + }; + case (#Possessive) { + transitionBuffer.add((quantifierStartState, #Epsilon, subTransitionTable[0].0)); + for (acceptState in subAcceptStates.vals()) { + transitionBuffer.add((acceptState, #Epsilon, quantifierAcceptState)); // Exit, no backtracking + }; }; }; - case (#Quantified(charClass, quantType)) { - // Handle quantifiers directly by adding transitions, no need to add to ranges. 
- let _ = compileQuantifier(quantType, #node(#CharacterClass(isNegated, [charClass])), transitions); + } else if (min > 0 and max != null) { + var currentStartState = quantifierStartState; + for (i in Iter.range(0, min - 1)) { + let (subTrans, subAcc) = buildNFA(subExpr, currentStartState + 1); + for ((fromState, transition, toState) in subTrans.vals()) { + transitionBuffer.add((fromState, transition, toState)); + }; + currentStartState := subAcc[0]; }; - }; - }; - // Sort and merge overlapping ranges - let sortedRanges : [(Char, Char)] = Buffer.toArray(ranges); - ignore Array.sort<(Char, Char)>(sortedRanges, func(a : (Char, Char), b : (Char, Char)) : Order.Order { - Char.compare(a.0, b.0) - }); - let mergedRanges = Buffer.Buffer<(Char, Char)>(sortedRanges.size()); - for (range in sortedRanges.vals()) { - switch (mergedRanges.removeLast()) { - case (null) { - mergedRanges.add(range); - }; - case (?lastRange) { - if (Char.toNat32(range.0) <= Char.toNat32(lastRange.1) + 1) { - mergedRanges.add((lastRange.0, Extensions.maxChar(lastRange.1, range.1))); - } else { - mergedRanges.add(lastRange); - mergedRanges.add(range); + for (i in Iter.range(min, maxVal - 1)) { + let (subTrans, subAcc) = buildNFA(subExpr, currentStartState + 1); + for ((fromState, transition, toState) in subTrans.vals()) { + transitionBuffer.add((fromState, transition, toState)); }; + currentStartState := subAcc[0]; }; }; - }; - // Add transitions based on merged ranges - if (isNegated) { - var lastChar : Char = Char.fromNat32(0); - for (range in mergedRanges.vals()) { - if (Char.toNat32(lastChar) < Char.toNat32(range.0)) { - transitions.add((start, #Range(lastChar, Char.fromNat32(Char.toNat32(range.0) - 1)), end)); - }; - lastChar := Char.fromNat32(Char.toNat32(range.1) + 1); - }; - if (Char.toNat32(lastChar) <= 255) { - transitions.add((start, #Range(lastChar, Char.fromNat32(255)), end)); - }; - } else { - for (range in mergedRanges.vals()) { - transitions.add((start, #Range(range.0, range.1), end)); - }; + (Buffer.toArray(transitionBuffer), [quantifierAcceptState]); }; - (start, end) - }; + case (#Group { subExpr; captureIndex; modifier }) { + let transitionBuffer = Buffer.Buffer<(State, Transition, State)>(10); + let groupStartState: State = startState; + let groupEndState: State = groupStartState + 1; - private func compileAnchor(anchorType : Types.AnchorType, transitions : Buffer.Buffer<(Types.State, Types.Transition, Types.State)>) : (Types.State, Types.State) { - let start = nextState; - nextState += 1; - let end = nextState; - nextState += 1; + switch (modifier) { + case null { + let (subTransitionTable, subAcceptStates) = buildNFA(subExpr, groupStartState + 1); - switch (anchorType) { - case (#StartOfString) { - transitions.add((start, #Epsilon, end)); - }; - case (#EndOfString) { - transitions.add((start, #Epsilon, end)); - }; - case (#WordBoundary) { - transitions.add((start, #Epsilon, end)); - }; - case (#NonWordBoundary) { - transitions.add((start, #Epsilon, end)); - }; - case (#StartOfStringOnly) { - transitions.add((start, #Epsilon, end)); - }; - case (#EndOfStringOnly) { - transitions.add((start, #Epsilon, end)); + for ((fromState, transition, toState) in subTransitionTable.vals()) { + transitionBuffer.add((fromState, transition, toState)); + }; + + for (acceptState in subAcceptStates.vals()) { + transitionBuffer.add((acceptState, #Epsilon, groupEndState)); + }; + + if (captureIndex != null) { + transitionBuffer.add((groupStartState, #Group {startState = groupStartState; endState = groupEndState; 
captureIndex}, groupEndState)); + }; + + (Buffer.toArray(transitionBuffer), [groupEndState]); + }; + + case (?#NonCapturing) { + let (subTransitionTable, subAcceptStates) = buildNFA(subExpr, groupStartState + 1); + + for ((fromState, transition, toState) in subTransitionTable.vals()) { + transitionBuffer.add((fromState, transition, toState)); + }; + + for (acceptState in subAcceptStates.vals()) { + transitionBuffer.add((acceptState, #Epsilon, groupEndState)); + }; + + (Buffer.toArray(transitionBuffer), [groupEndState]); + }; + + case (?#PositiveLookahead) { + let (subTransitionTable, _) = buildNFA(subExpr, groupStartState); + + for ((fromState, transition, toState) in subTransitionTable.vals()) { + transitionBuffer.add((fromState, transition, toState)); + }; + + (Buffer.toArray(transitionBuffer), [groupStartState]); + }; + + case (?#NegativeLookahead) { + let (subTransitionTable, _) = buildNFA(subExpr, groupStartState); + + for ((fromState, transition, toState) in subTransitionTable.vals()) { + transitionBuffer.add((fromState, transition, toState)); + }; + + (Buffer.toArray(transitionBuffer), [groupStartState]); // Group ends without advancing + }; + + case (?#PositiveLookbehind) { + let (subTransitionTable, _) = buildNFA(subExpr, groupStartState); + + for ((fromState, transition, toState) in subTransitionTable.vals()) { + transitionBuffer.add((fromState, transition, toState)); + }; + + (Buffer.toArray(transitionBuffer), [groupStartState]); + }; + + case (?#NegativeLookbehind) { + let (subTransitionTable, _) = buildNFA(subExpr, groupStartState); + + for ((fromState, transition, toState) in subTransitionTable.vals()) { + transitionBuffer.add((fromState, transition, toState)); + }; + + (Buffer.toArray(transitionBuffer), [groupStartState]); + }; + }; }; - case (#PreviousMatchEnd) { - transitions.add((start, #Epsilon, end)); + + case (#Metacharacter metacharType) { + let transitionBuffer = Buffer.Buffer<(State, Transition, State)>(1); + let acceptState: State = startState + 1; + switch (metacharType) { + case (#Dot) { + transitionBuffer.add((startState, #Any, acceptState)); + }; + case (_) { + let metaRanges = Extensions.metacharToRanges(metacharType); + for (range in metaRanges.vals()) { + transitionBuffer.add((startState, #Range(range.0, range.1), acceptState)); + }; + }; + }; + (Buffer.toArray(transitionBuffer), [acceptState]); }; - }; - (start, end) - }; + case (#CharacterClass { isNegated; classes }) { + let transitionBuffer = Buffer.Buffer<(State, Transition, State)>(classes.size()); + let acceptState: State = startState + 1; + let ranges = Buffer.Buffer<(Char, Char)>(classes.size()); + for (charClass in classes.vals()) { + switch (charClass) { + case (#Single(char)) { + ranges.add((char, char)); + }; + case (#Range(from, to)) { + ranges.add((from, to)); + }; + case (#Metacharacter(metaType)) { + let metaRanges = Extensions.metacharToRanges(metaType); + for (range in metaRanges.vals()) { + ranges.add(range); + }; + }; + case (#Quantified(charClass, quantType)) { + ignore buildNFA(#Quantifier { + subExpr = #CharacterClass({isNegated = isNegated; classes = [charClass]}); + min = quantType.min; + max = quantType.max; + mode = quantType.mode; + }, startState); + }; + + }; + }; + + if (isNegated) { + var lastChar: Char = Char.fromNat32(0); + let sortedRanges = Buffer.toArray(ranges); + ignore Array.sort<(Char, Char)>(sortedRanges, func(a: (Char, Char), b: (Char, Char)) : Order.Order { + Char.compare(a.0, b.0) + }); + for (range in sortedRanges.vals()) { + if (Char.toNat32(lastChar) < 
Char.toNat32(range.0)) { + transitionBuffer.add((startState, #Range(lastChar, Char.fromNat32(Char.toNat32(range.0) - 1)), acceptState)); + }; + lastChar := Char.fromNat32(Char.toNat32(range.1) + 1); + }; + if (Char.toNat32(lastChar) <= 255) { + transitionBuffer.add((startState, #Range(lastChar, Char.fromNat32(255)), acceptState)); + }; + } else { + for (range in ranges.vals()) { + transitionBuffer.add((startState, #Range(range.0, range.1), acceptState)); + }; + }; - private func compileMetacharacter(metaType : Types.MetacharacterType, transitions : Buffer.Buffer<(Types.State, Types.Transition, Types.State)>) : (Types.State, Types.State) { - let start = nextState; - nextState += 1; - let end = nextState; - nextState += 1; - switch (metaType) { - case (#Dot) { - transitions.add((start, #Any, end)); + (Buffer.toArray(transitionBuffer), [acceptState]); }; - case (_) { - let metaRanges = Extensions.metacharToRanges(metaType); - for (range in metaRanges.vals()) { - transitions.add((start, #Range(range.0, range.1), end)); - } + + case (#Anchor anchorType) { + let transitionBuffer = Buffer.Buffer<(State, Transition, State)>(1); + let acceptState: State = startState + 1; + switch (anchorType) { + case (#StartOfString) { + transitionBuffer.add((startState, #Epsilon, acceptState)); + }; + case (#EndOfString) { + transitionBuffer.add((startState, #Epsilon, acceptState)); + }; + case (#WordBoundary) { + transitionBuffer.add((startState, #Epsilon, acceptState)); + }; + case (#NonWordBoundary) { + transitionBuffer.add((startState, #Epsilon, acceptState)); + }; + case (#StartOfStringOnly) { + transitionBuffer.add((startState, #Epsilon, acceptState)); + }; + case (#EndOfStringOnly) { + transitionBuffer.add((startState, #Epsilon, acceptState)); + }; + case (#PreviousMatchEnd) { + transitionBuffer.add((startState, #Epsilon, acceptState)); + }; + }; + (Buffer.toArray(transitionBuffer), [acceptState]); }; - }; - (start, end) } - };*/ -} \ No newline at end of file + }; + }; +}; diff --git a/src/motoko_regex/Extensions.mo b/src/motoko_regex/Extensions.mo index 2e537ec..5b75f1d 100644 --- a/src/motoko_regex/Extensions.mo +++ b/src/motoko_regex/Extensions.mo @@ -150,8 +150,8 @@ module{ Array.tabulate(end - start, func(i:Nat) { arr[start + i] }) }; //Error handling - public func errorToText(error : Types.LexerError) : Text { - switch (error) { + public func errorToText(error: Types.RegexError): Text { + switch (error) { case (#GenericError(text)) text; case (#InvalidEscapeSequence(char)) "Invalid escape sequence: " # Char.toText(char); case (#InvalidQuantifierRange(text)) "Invalid quantifier range: " # text; @@ -159,8 +159,12 @@ module{ case (#UnexpectedCharacter(char)) "Unexpected character: " # Char.toText(char); case (#UnexpectedEndOfInput) "Unexpected end of input"; case (#UnmatchedParenthesis(char)) "Unmatched parenthesis: " # Char.toText(char); - }; + case (#UnexpectedToken(tokenType)) "Unexpected token: " # debug_show(tokenType); + case (#UnclosedGroup(text)) "Unclosed group: " # text; + case (#InvalidQuantifier(text)) "Invalid quantifier: " # text; + }; }; + //replace an element of a buffer given an index public func replace(buffer: Buffer.Buffer, index: Nat, newElement: T) { if (index >= buffer.size()) { diff --git a/src/motoko_regex/Lexer.mo b/src/motoko_regex/Lexer.mo index e6fda89..5a15fd0 100644 --- a/src/motoko_regex/Lexer.mo +++ b/src/motoko_regex/Lexer.mo @@ -11,7 +11,7 @@ import Cursor "Cursor"; module { type Token = Types.Token; - type LexerError = Types.LexerError; + type LexerError = 
Types.RegexError; type CharacterClass = Types.CharacterClass; public class Lexer(input: Text) { let cursor = Cursor.Cursor(input); @@ -194,7 +194,7 @@ module { if (not cursor.hasNext()) { return #err(#GenericError("Unexpected end of input at position " # Nat.toText(start))); }; - cursor.inc(); + cursor.inc(); // Consume the opening parenthesis let groupModifierResult = parseGroupModifier(); var groupModifier: ?Types.GroupModifierType = null; @@ -218,7 +218,7 @@ module { return #err(#GenericError("Expected closing parenthesis at position " # Nat.toText(cursor.getPos()) # ", found '" # Text.fromChar(cursor.current()) # "'")); }; - cursor.inc(); + cursor.inc(); // Consume the closing parenthesis let groupToken: Token = { tokenType = #Group({ @@ -226,7 +226,7 @@ module { subTokens = subTokens; }); value = Extensions.slice(input, start, ?cursor.getPos()); - position = #Span(start, cursor.getPos() -1); + position = #Span(start, cursor.getPos() - 1); }; #ok(groupToken) }; @@ -269,9 +269,10 @@ module { while (cursor.hasNext() and depth > 0) { switch (cursor.current()) { case '(' { - depth += 1; switch (tokenizeGroup()) { - case (#ok(token)) { subTokens.add(token) }; + case (#ok(token)) { + subTokens.add(token); + }; case (#err(error)) { return #err(error) }; }; }; @@ -280,7 +281,7 @@ module { if (depth == 0) { return #ok(subTokens); } else { - cursor.inc(); + return #ok(subTokens); }; }; case _ { @@ -297,7 +298,7 @@ module { }; #ok(subTokens) -}; + }; private func tokenizeEscapedChar(): Result.Result { cursor.inc(); diff --git a/src/motoko_regex/Optimizer.mo b/src/motoko_regex/Optimizer.mo index 4527c26..ad96f6b 100644 --- a/src/motoko_regex/Optimizer.mo +++ b/src/motoko_regex/Optimizer.mo @@ -5,7 +5,7 @@ import Hash "mo:base/Hash"; import Iter "mo:base/Iter"; import Nat "mo:base/Nat"; -module { +/* module { public class Optimizer() { private var epsilonClosureCache = HashMap.HashMap>(10, Nat.equal, Hash.hash); @@ -134,5 +134,5 @@ module { stateTransitionsMap }; }; -} +} */ diff --git a/src/motoko_regex/Parser.mo b/src/motoko_regex/Parser.mo index eb12ec5..3fe3f17 100644 --- a/src/motoko_regex/Parser.mo +++ b/src/motoko_regex/Parser.mo @@ -1,16 +1,21 @@ -import Debug "mo:base/Debug"; import Nat "mo:base/Nat"; import Types "Types"; +import Result "mo:base/Result"; module { - public class Parser(tokens: [Types.Token]) { - var cursor: Nat = 0; + public type ParserError = Types.RegexError; + public type Token = Types.Token; + public type AST = Types.AST; - public func parse(): ?Types.AST { + public class Parser(initialTokens: [Token]) { + var tokens = initialTokens; + var cursor : Nat = 0; + var captureGroupIndex = 1; + public func parse(): Result.Result { parseAlternation() }; - private func parseAlternation(): ?Types.AST { + private func parseAlternation(): Result.Result { var left = parseConcatenation(); label l while (cursor < tokens.size()) { switch (peekToken()) { @@ -18,16 +23,19 @@ module { if (token.tokenType == #Alternation) { ignore advanceCursor(); switch (parseConcatenation()) { - case (?right) { + case (#ok(right)) { switch (left) { - case (?l) { - left := ?#node(#Alternation(l, right)); - + case (#ok(l)) { + left := #ok(#Alternation([l, right])); + }; + case (#err(error)) { + return #err(error); }; - case null { break l; }; }; }; - case (null) { break l; }; + case (#err(error)) { + return #err(error); + }; }; } else { break l; }; }; @@ -37,22 +45,26 @@ module { left }; - private func parseConcatenation(): ?Types.AST { + private func parseConcatenation(): Result.Result { var left = 
parseQuantifier(); label l while (cursor < tokens.size()) { switch (peekToken()) { case (?token) { - if (token.tokenType != #Alternation and token.tokenType != #GroupEnd) { + if (token.tokenType != #Alternation) { switch (parseQuantifier()) { - case (?right) { + case (#ok(right)) { switch (left) { - case (?l) { - left := ?#node(#Concatenation(l, right)); + case (#ok(l)) { + left := #ok(#Concatenation([l, right])); + }; + case (#err(error)) { + return #err(error); }; - case null { break l; }; }; }; - case (null) { break l; }; + case (#err(error)) { + return #err(error); + }; }; } else { break l; }; }; @@ -62,88 +74,121 @@ module { left }; - private func parseQuantifier(): ?Types.AST { + private func parseQuantifier(): Result.Result { var node = parsePrimary(); if (cursor < tokens.size()) { - switch (peekToken()) { - case (?token) { - switch (token.tokenType) { - case (#Quantifier(quantType)) { - ignore advanceCursor(); - switch (node) { - case (?n) { - node := ?#node(#Quantifier(quantType, n)); - }; - case null {}; - }; - }; - case (_) {}; + switch (peekToken()) { + case (?token) { + switch (token.tokenType) { + case (#Quantifier(quantType)) { + ignore advanceCursor(); + switch (node) { + case (#ok(n)) { + node := #ok(#Quantifier({ + subExpr = n; + min = quantType.min; + max = quantType.max; + mode = quantType.mode; + })); + }; + case (#err(error)) { + return #err(error); }; + }; }; - case (null) {}; + case (_) {}; + }; }; + case (null) {}; + }; }; node - }; + }; - private func parsePrimary(): ?Types.AST { + private func parsePrimary(): Result.Result { switch (advanceCursor()) { case (?token) { switch (token.tokenType) { case (#Character(char)) { - ?#node(#Character(char)); + #ok(#Character(char)); }; - case (#GroupStart) { - switch (parseAlternation()) { - case (?groupNode) { - switch (expectToken(#GroupEnd)) { - case (?_) { - ?#node(#Group(groupNode)); + case (#Group(groupData)) { + // Determine if this is a capturing group + let isCapturing = switch (groupData.modifier) { + case (?#NonCapturing) { false }; + case (_) { true }; + }; + // Assign capture index for capturing groups + let currentCaptureIndex = if (isCapturing) { + let index = captureGroupIndex; + captureGroupIndex += 1; // Increment for the next capturing group + ?index; + } else { + null; + }; + + switch (parseGroup(groupData)) { + case (#ok(groupNode)) { + #ok(#Group({ + subExpr = groupNode; + modifier = switch (groupData.modifier) { + case (?mod) { ?mod }; + case (null) { + if (isCapturing) { null } else { ?#NonCapturing }; + }; }; - case null { null }; - }; + captureIndex = currentCaptureIndex; + })); + }; + case (#err(error)) { + #err(error); }; - case (null) { null }; }; }; case (#CharacterClass(isNegated, classes)) { - ?#node (#CharacterClass(isNegated, classes)); + #ok(#CharacterClass({ + isNegated = isNegated; + classes = classes; + })); }; case (#Anchor(anchorType)) { - ?#node(#Anchor(anchorType)); + #ok(#Anchor(anchorType)); }; case (#Metacharacter(metaType)) { - ?#node (#Metacharacter(metaType)); + #ok(#Metacharacter(metaType)); }; case (_) { - Debug.print("Unexpected token: " # debug_show(token.tokenType)); - null + #err(#GenericError("Unexpected token: " # debug_show(token.tokenType))); }; }; }; - case (null) { null }; - } - }; - - private func expectToken(expectedType: Types.TokenType): ?Types.Token { - switch (advanceCursor()) { - case (?token) { - if (token.tokenType != expectedType) { - Debug.print("Expected " # debug_show(expectedType) # " but found " # debug_show(token.tokenType)); - null - } else { - 
?token - }; - }; case (null) { - Debug.print("Unexpected end of input"); - null + #err(#UnexpectedEndOfInput); }; - } + }; + }; + + private func parseGroup(groupData: {modifier: ?Types.GroupModifierType; subTokens: [Token]}): Result.Result { + // Save current state + let savedTokens = tokens; + let savedCursor = cursor; + + // Set new state for parsing the group + tokens := groupData.subTokens; + cursor := 0; + + // Parse the group + let result = parseAlternation(); + + // Restore previous state + tokens := savedTokens; + cursor := savedCursor; + + result }; - private func peekToken(): ?Types.Token { + private func peekToken(): ?Token { if (cursor < tokens.size()) { ?tokens[cursor] } else { @@ -151,7 +196,7 @@ module { } }; - private func advanceCursor(): ?Types.Token { + private func advanceCursor(): ?Token { if (cursor < tokens.size()) { let token = tokens[cursor]; cursor += 1; diff --git a/src/motoko_regex/Regex.mo b/src/motoko_regex/Regex.mo index 529d8a1..884fc94 100644 --- a/src/motoko_regex/Regex.mo +++ b/src/motoko_regex/Regex.mo @@ -1,7 +1,9 @@ import Lexer "Lexer"; import Parser "Parser"; import Compiler "Compiler"; +import Extensions "Extensions"; import Types "Types"; +import Debug "mo:base/Debug"; actor { @@ -10,32 +12,43 @@ actor { lexer.tokenize(); }; - /*public query func testParser(t: Text): async ?Types.AST { - let lexer = Lexer.Lexer(t); - let tokenResult = lexer.tokenize(); - let parser = Parser.Parser(tokenResult); - parser.parse(); - }; + public query func testParser(t: Text): async ?Types.AST { + let lexer = Lexer.Lexer(t); + let tokenResult = lexer.tokenize(); + let parser = Parser.Parser(tokenResult); + + switch (parser.parse()) { + case (#ok(ast)) { + ?ast + }; + case (#err(error)) { + Debug.print("Parser error: " # Extensions.errorToText(error)); + null + }; + } +}; public query func testCompiler(t: Text): async Types.CompiledRegex { - let lexer = Lexer.Lexer(t); - let tokenResult = lexer.tokenize(); - let parser = Parser.Parser(tokenResult); - let astResult = parser.parse(); - switch (astResult) { - case (?ast) { - let compiler = Compiler.Compiler(); - compiler.compile(ast); - }; - case null { - { - transitions = []; - startState = 0; - acceptStates = []; - captureGroups = []; - }; + let lexer = Lexer.Lexer(t); + let tokenResult = lexer.tokenize(); + let parser = Parser.Parser(tokenResult); + let astResult = parser.parse(); + + switch (astResult) { + case (#ok(ast)) { + let compiler = Compiler.Compiler(); + compiler.compile(ast); + }; + case (#err(error)) { + Debug.print("Compiler error: " # Extensions.errorToText(error)); + { + transitions = []; + startState = 0; + acceptStates = []; }; }; - };*/ + } +}; + }; diff --git a/src/motoko_regex/Types.mo b/src/motoko_regex/Types.mo index d5196dd..8433216 100644 --- a/src/motoko_regex/Types.mo +++ b/src/motoko_regex/Types.mo @@ -1,9 +1,7 @@ import Text "mo:base/Text"; import Char "mo:base/Char"; module{ - public type AST = { - node: ASTNode; - }; + public type AST = ASTNode; public type ASTNode = { #Character : Char; #Concatenation : [AST]; @@ -16,7 +14,7 @@ module{ }; #Group : { subExpr: AST; - modifier: GroupModifierType; + modifier: ?GroupModifierType; captureIndex: ?Nat; }; #Metacharacter : MetacharacterType; @@ -95,35 +93,40 @@ public type QuantifierType = { position : Position; }; - public type LexerError = { - #UnexpectedCharacter : Char; - #UnexpectedEndOfInput; - #GenericError :Text; - #InvalidQuantifierRange : Text; - #InvalidEscapeSequence : Char; - #UnmatchedParenthesis : Char; - #MismatchedParenthesis : 
(Char, Char); - }; + public type RegexError = { + #UnexpectedCharacter: Char; + #UnexpectedEndOfInput; + #GenericError: Text; + #InvalidQuantifierRange: Text; + #InvalidEscapeSequence: Char; + #UnmatchedParenthesis: Char; + #MismatchedParenthesis: (Char, Char); + #UnexpectedToken: TokenType; + #UnclosedGroup: Text; + #InvalidQuantifier: Text; +}; public type State = Nat; - public type TransitionTable = [(State, Transition, State)]; - public type Transition = { - #Char : Char; - #Range : (Char, Char); - #Any; - #Epsilon; - }; - - public type CompiledRegex = { - transitions : TransitionTable; - startState : State; - acceptStates : [State]; - captureGroups : [CaptureGroup]; - }; - - public type CaptureGroup = { +public type Transition = { + #Char : Char; + #Range : (Char, Char); + #Any; + #Epsilon; + #Group : { startState : State; endState : State; + captureIndex : ?Nat; }; + #Alternation; +}; + + +public type TransitionTable = [(State, Transition, State)]; + +public type CompiledRegex = { + transitions : TransitionTable; + startState : State; + acceptStates : [State]; +}; } \ No newline at end of file diff --git a/src/motoko_regex/teststrings b/src/motoko_regex/teststrings index be09a22..4afd835 100644 --- a/src/motoko_regex/teststrings +++ b/src/motoko_regex/teststrings @@ -5,4 +5,5 @@ \d+ [^0-9] [a-z]+ -([a-z]{5}-) \ No newline at end of file +((a)(b(c))) +(abc)+ \ No newline at end of file
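
---

**Usage sketch (not part of the patch):** a minimal example of the reworked `Result`-based pipeline, mirroring the `testParser`/`testCompiler` functions added to `Regex.mo` above. The actor and function names (`PipelineExample`, `compilePattern`) are illustrative assumptions only; the calls themselves (`Lexer.Lexer(...).tokenize()`, `Parser.parse()` returning `#ok`/`#err`, `Compiler.compile` producing a `CompiledRegex` with `transitions`, `startState`, and `acceptStates` but no `captureGroups`) follow the APIs introduced by this patch.

```motoko
import Lexer "Lexer";
import Parser "Parser";
import Compiler "Compiler";
import Extensions "Extensions";
import Types "Types";
import Debug "mo:base/Debug";

actor PipelineExample {
  // Compile a pattern end-to-end: lex -> parse (Result) -> Thompson-style NFA.
  public query func compilePattern(pattern : Text) : async ?Types.CompiledRegex {
    let lexer = Lexer.Lexer(pattern);
    let tokens = lexer.tokenize();     // tokens are fed straight into the parser, as in Regex.mo
    let parser = Parser.Parser(tokens);

    switch (parser.parse()) {
      case (#ok(ast)) {
        let compiler = Compiler.Compiler();
        ?compiler.compile(ast)         // { transitions; startState; acceptStates }
      };
      case (#err(error)) {
        Debug.print("Parse failed: " # Extensions.errorToText(error));
        null
      };
    }
  };
};
```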