Skip to content

Commit

Permalink
Initial Commit, Changelog update to follow
Browse files Browse the repository at this point in the history
  • Loading branch information
Demali-876 committed Oct 8, 2024
1 parent 420f70a commit 223b153
Show file tree
Hide file tree
Showing 9 changed files with 531 additions and 393 deletions.
27 changes: 17 additions & 10 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
## 🚀 New Features

- **Flattened AST Structure**
- Introduced a new, flattened AST structure to simplify regex expression handling. This includes using lists for concatenations and alternations, reducing the depth of nested expressions.
- [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | New:** Introduced a new, flattened AST structure to simplify regex expression handling. This includes using lists for concatenations and alternations, reducing the depth of nested expressions.
- **Single Group Token**
- Introduced a unified `#Group` token that now encapsulates the group modifier and sub-expression, streamlining group handling in both the lexer and the parser.
- [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | New:** Introduced a unified `#Group` token that now encapsulates the group modifier and sub-expression, streamlining group handling in both the lexer and the parser.
- **Capture Index Tracking**
- Implemented capture group index tracking for pre-matching optimization.

## 🐛 Bug Fixes

Expand All @@ -27,26 +29,31 @@
## 🔄 Changes

- **Reduced Token Count in Lexer**
- Reduced the number of tokens in the lexer by eliminating `#GroupStart` and `#GroupEnd` in favor of a single `#Group` token, simplifying group handling during the parsing process.
  - [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | Improved:** Reduced the number of tokens in the lexer by eliminating `#GroupStart` and `#GroupEnd` in favor of a single `#Group` token, simplifying group handling during the parsing process.
- **Unified Group Token**
- Removed the standalone `GroupModifierType` token, integrating it into the `#Group` token type.
  - [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | Improved:** Removed the standalone `GroupModifierType` token, integrating it into the `#Group` token type.
- **NextToken() Improvements**
- Removed redundant cases.
- [#420f70a](https://github.com/Demali-876/motoko_regex_engine/commit/420f70a46a0ace335d2be8631b2c372022b8f2f2) **Date: 10-07-2024 | Improved:** Removed redundant cases.
- **Complete Parser and Compiler Redesign**
- Optimized NFA generation using Thompson's Construction Algorithm.
  - Complete parser rework using the new AST structure.
  - Introduction of parser errors: the parser now produces either an AST or an error.

## ❌ Removed

- [#6279c34](https://github.com/Demali-876/motoko_regex_engine/commit/6279c34557a50328ac43555533fbf5708f867679) **Date: 10-03-2024 | Removed:** Removed `#QuantifierRange` token. All ranges are now handled by the `#Quantifier` token.
- Eliminated redundant tokens.

---

### **To-Do Checklist**

- [ ] **Parser Overhaul**
- [ ] Adapt the parser to utilize the new flattened AST structure.
- [ ] Ensure proper handling of the unified `#Group` token, including its modifiers and sub-expression references.
- [x] **Parser Overhaul | Status: Completed | Date: 10-08-2024** Commit:
- [x] Adapt the parser to utilize the new flattened AST structure.
- [x] Ensure proper handling of the unified `#Group` token, including its modifiers and sub-expression references.

- [ ] **NFA Construction**
- [ ] Refactor NFA construction to take advantage of the flattened AST for incremental optimization.
- [x] **NFA Construction | Status: Started | Date: 10-08-2024** Commit:
- [x] Refactor NFA construction to take advantage of the flattened AST for incremental optimization.
- [ ] Implement state reduction and bisimulation in the NFA to prevent state explosion.

- [ ] **Incremental Optimization**
Expand Down
560 changes: 312 additions & 248 deletions src/motoko_regex/Compiler.mo

Large diffs are not rendered by default.

10 changes: 7 additions & 3 deletions src/motoko_regex/Extensions.mo
Original file line number Diff line number Diff line change
Expand Up @@ -150,17 +150,21 @@ module{
Array.tabulate<T>(end - start, func(i:Nat) { arr[start + i] })
};
//Error handling
public func errorToText(error : Types.LexerError) : Text {
switch (error) {
public func errorToText(error: Types.RegexError): Text {
switch (error) {
case (#GenericError(text)) text;
case (#InvalidEscapeSequence(char)) "Invalid escape sequence: " # Char.toText(char);
case (#InvalidQuantifierRange(text)) "Invalid quantifier range: " # text;
case (#MismatchedParenthesis(left, right)) "Mismatched parenthesis: " # Char.toText(left) # " and " # Char.toText(right);
case (#UnexpectedCharacter(char)) "Unexpected character: " # Char.toText(char);
case (#UnexpectedEndOfInput) "Unexpected end of input";
case (#UnmatchedParenthesis(char)) "Unmatched parenthesis: " # Char.toText(char);
};
case (#UnexpectedToken(tokenType)) "Unexpected token: " # debug_show(tokenType);
case (#UnclosedGroup(text)) "Unclosed group: " # text;
case (#InvalidQuantifier(text)) "Invalid quantifier: " # text;
};
};

//replace an element of a buffer given an index
public func replace<T>(buffer: Buffer.Buffer<T>, index: Nat, newElement: T) {
if (index >= buffer.size()) {
Expand Down
17 changes: 9 additions & 8 deletions src/motoko_regex/Lexer.mo
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import Cursor "Cursor";

module {
type Token = Types.Token;
type LexerError = Types.LexerError;
type LexerError = Types.RegexError;
type CharacterClass = Types.CharacterClass;
public class Lexer(input: Text) {
let cursor = Cursor.Cursor(input);
Expand Down Expand Up @@ -194,7 +194,7 @@ module {
if (not cursor.hasNext()) {
return #err(#GenericError("Unexpected end of input at position " # Nat.toText(start)));
};
cursor.inc();
cursor.inc(); // Consume the opening parenthesis

let groupModifierResult = parseGroupModifier();
var groupModifier: ?Types.GroupModifierType = null;
Expand All @@ -218,15 +218,15 @@ module {
return #err(#GenericError("Expected closing parenthesis at position " # Nat.toText(cursor.getPos()) # ", found '" # Text.fromChar(cursor.current()) # "'"));
};

cursor.inc();
cursor.inc(); // Consume the closing parenthesis

let groupToken: Token = {
tokenType = #Group({
modifier = groupModifier;
subTokens = subTokens;
});
value = Extensions.slice(input, start, ?cursor.getPos());
position = #Span(start, cursor.getPos() -1);
position = #Span(start, cursor.getPos() - 1);
};
#ok(groupToken)
};
Expand Down Expand Up @@ -269,9 +269,10 @@ module {
while (cursor.hasNext() and depth > 0) {
switch (cursor.current()) {
case '(' {
depth += 1;
switch (tokenizeGroup()) {
case (#ok(token)) { subTokens.add(token) };
case (#ok(token)) {
subTokens.add(token);
};
case (#err(error)) { return #err(error) };
};
};
Expand All @@ -280,7 +281,7 @@ module {
if (depth == 0) {
return #ok(subTokens);
} else {
cursor.inc();
return #ok(subTokens);
};
};
case _ {
Expand All @@ -297,7 +298,7 @@ module {
};

#ok(subTokens)
};
};

private func tokenizeEscapedChar(): Result.Result<Token, LexerError> {
cursor.inc();
Expand Down
4 changes: 2 additions & 2 deletions src/motoko_regex/Optimizer.mo
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import Hash "mo:base/Hash";
import Iter "mo:base/Iter";
import Nat "mo:base/Nat";

module {
/* module {
public class Optimizer() {
private var epsilonClosureCache = HashMap.HashMap<Types.State, Buffer.Buffer<Types.State>>(10, Nat.equal, Hash.hash);
Expand Down Expand Up @@ -134,5 +134,5 @@ module {
stateTransitionsMap
};
};
}
} */

Loading

0 comments on commit 223b153

Please sign in to comment.