Skip to content

Commit

Permalink
Initial Commit, Changelog update to follow
Browse files Browse the repository at this point in the history
  • Loading branch information
Demali-876 committed Oct 8, 2024
1 parent 420f70a commit 223b153
Show file tree
Hide file tree
Showing 9 changed files with 531 additions and 393 deletions.
27 changes: 17 additions & 10 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
## 🚀 New Features

- **Flattened AST Structure**
- Introduced a new, flattened AST structure to simplify regex expression handling. This includes using lists for concatenations and alternations, reducing the depth of nested expressions.
- [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | New:** Introduced a new, flattened AST structure to simplify regex expression handling. This includes using lists for concatenations and alternations, reducing the depth of nested expressions.
- **Single Group Token**
- Introduced a unified `#Group` token that now encapsulates the group modifier and sub-expression, streamlining group handling in both the lexer and the parser.
- [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | New:** Introduced a unified `#Group` token that now encapsulates the group modifier and sub-expression, streamlining group handling in both the lexer and the parser.
- **Capture Index Tracking**
- Implemented capture group index tracking for pre-matching optimization.

## 🐛 Bug Fixes

Expand All @@ -27,26 +29,31 @@
## 🔄 Changes

- **Reduced Token Count in Lexer**
- Reduced the number of tokens in the lexer by eliminating `#GroupStart` and `#GroupEnd` in favor of a single `#Group` token, simplifying group handling during the parsing process.
  - [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | Improved:** Reduced the number of tokens in the lexer by eliminating `#GroupStart` and `#GroupEnd` in favor of a single `#Group` token, simplifying group handling during the parsing process.
- **Unified Group Token**
- Removed the standalone `GroupModifierType` token, integrating it into the `#Group` token type.
  - [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | Improved:** Removed the standalone `GroupModifierType` token, integrating it into the `#Group` token type.
- **NextToken() Improvements**
- Removed redundant cases.
- [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | Improved:** Removed redundant cases.
- **Complete Parser and Compiler Redesign**
- Optimized NFA generation using Thompson's Construction Algorithm.
  - Complete parser rework using the new AST structure.
  - Introduction of parser errors: the parser now produces either an AST or an error.

## ❌ Removed

- [#6279c34](https://github.com/Demali-876/motoko_regex_engine/commit/6279c34557a50328ac43555533fbf5708f867679) **Date: 10-03-2024 | Removed:** Removed `#QuantifierRange` token. All ranges are now handled by the `#Quantifier` token.
- Eliminated redundant tokens.

---

### **To-Do Checklist**

- [ ] **Parser Overhaul**
- [ ] Adapt the parser to utilize the new flattened AST structure.
- [ ] Ensure proper handling of the unified `#Group` token, including its modifiers and sub-expression references.
- [x] **Parser Overhaul | Status: Completed | Date: 10-08-2024** Commit:
- [x] Adapt the parser to utilize the new flattened AST structure.
- [x] Ensure proper handling of the unified `#Group` token, including its modifiers and sub-expression references.

- [ ] **NFA Construction**
- [ ] Refactor NFA construction to take advantage of the flattened AST for incremental optimization.
- [x] **NFA Construction | Status: Started | Date: 10-08-2024** Commit:
- [x] Refactor NFA construction to take advantage of the flattened AST for incremental optimization.
- [ ] Implement state reduction and bisimulation in the NFA to prevent state explosion.

- [ ] **Incremental Optimization**
Expand Down
560 changes: 312 additions & 248 deletions src/motoko_regex/Compiler.mo

Large diffs are not rendered by default.

10 changes: 7 additions & 3 deletions src/motoko_regex/Extensions.mo
Original file line number Diff line number Diff line change
Expand Up @@ -150,17 +150,21 @@ module{
Array.tabulate<T>(end - start, func(i:Nat) { arr[start + i] })
};
//Error handling
public func errorToText(error : Types.LexerError) : Text {
switch (error) {
public func errorToText(error: Types.RegexError): Text {
switch (error) {
case (#GenericError(text)) text;
case (#InvalidEscapeSequence(char)) "Invalid escape sequence: " # Char.toText(char);
case (#InvalidQuantifierRange(text)) "Invalid quantifier range: " # text;
case (#MismatchedParenthesis(left, right)) "Mismatched parenthesis: " # Char.toText(left) # " and " # Char.toText(right);
case (#UnexpectedCharacter(char)) "Unexpected character: " # Char.toText(char);
case (#UnexpectedEndOfInput) "Unexpected end of input";
case (#UnmatchedParenthesis(char)) "Unmatched parenthesis: " # Char.toText(char);
};
case (#UnexpectedToken(tokenType)) "Unexpected token: " # debug_show(tokenType);
case (#UnclosedGroup(text)) "Unclosed group: " # text;
case (#InvalidQuantifier(text)) "Invalid quantifier: " # text;
};
};

//replace an element of a buffer given an index
public func replace<T>(buffer: Buffer.Buffer<T>, index: Nat, newElement: T) {
if (index >= buffer.size()) {
Expand Down
17 changes: 9 additions & 8 deletions src/motoko_regex/Lexer.mo
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import Cursor "Cursor";

module {
type Token = Types.Token;
type LexerError = Types.LexerError;
type LexerError = Types.RegexError;
type CharacterClass = Types.CharacterClass;
public class Lexer(input: Text) {
let cursor = Cursor.Cursor(input);
Expand Down Expand Up @@ -194,7 +194,7 @@ module {
if (not cursor.hasNext()) {
return #err(#GenericError("Unexpected end of input at position " # Nat.toText(start)));
};
cursor.inc();
cursor.inc(); // Consume the opening parenthesis

let groupModifierResult = parseGroupModifier();
var groupModifier: ?Types.GroupModifierType = null;
Expand All @@ -218,15 +218,15 @@ module {
return #err(#GenericError("Expected closing parenthesis at position " # Nat.toText(cursor.getPos()) # ", found '" # Text.fromChar(cursor.current()) # "'"));
};

cursor.inc();
cursor.inc(); // Consume the closing parenthesis

let groupToken: Token = {
tokenType = #Group({
modifier = groupModifier;
subTokens = subTokens;
});
value = Extensions.slice(input, start, ?cursor.getPos());
position = #Span(start, cursor.getPos() -1);
position = #Span(start, cursor.getPos() - 1);
};
#ok(groupToken)
};
Expand Down Expand Up @@ -269,9 +269,10 @@ module {
while (cursor.hasNext() and depth > 0) {
switch (cursor.current()) {
case '(' {
depth += 1;
switch (tokenizeGroup()) {
case (#ok(token)) { subTokens.add(token) };
case (#ok(token)) {
subTokens.add(token);
};
case (#err(error)) { return #err(error) };
};
};
Expand All @@ -280,7 +281,7 @@ module {
if (depth == 0) {
return #ok(subTokens);
} else {
cursor.inc();
return #ok(subTokens);
};
};
case _ {
Expand All @@ -297,7 +298,7 @@ module {
};

#ok(subTokens)
};
};

private func tokenizeEscapedChar(): Result.Result<Token, LexerError> {
cursor.inc();
Expand Down
4 changes: 2 additions & 2 deletions src/motoko_regex/Optimizer.mo
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import Hash "mo:base/Hash";
import Iter "mo:base/Iter";
import Nat "mo:base/Nat";

module {
/* module {
public class Optimizer() {
private var epsilonClosureCache = HashMap.HashMap<Types.State, Buffer.Buffer<Types.State>>(10, Nat.equal, Hash.hash);
Expand Down Expand Up @@ -134,5 +134,5 @@ module {
stateTransitionsMap
};
};
}
} */

Loading

0 comments on commit 223b153

Please sign in to comment.