-
Notifications
You must be signed in to change notification settings - Fork 116
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add multiaddr expression group matching
Support captures export some things wip thinking about public API Think about exposing meg as a public API doc comments Finish rename Add helper for meg and add test add comment for devs
- Loading branch information
Showing
6 changed files
with
545 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
// Package meg implements regular expressions for multiaddr Components. The name is short for "Megular Expressions". | ||
package meg | ||
|
||
// The developer is assumed to be familiar with the Thompson NFA approach to | ||
// regex before making changes to this file. Refer to | ||
// https://swtch.com/~rsc/regexp/regexp1.html for an introduction. | ||
|
||
import ( | ||
"fmt" | ||
"slices" | ||
) | ||
|
||
// stateKind identifies the role a MatchState plays in the NFA.
type stateKind uint8

const (
	// matchCode states consume one component whose code equals the state's code.
	matchCode stateKind = iota
	// split states consume nothing; both next and nextSplit are followed.
	split
	// done states mark an accepting state.
	done
)
|
||
// MatchState is the Thompson NFA for a regular expression. | ||
type MatchState struct { | ||
capture captureFunc | ||
next *MatchState | ||
nextSplit *MatchState | ||
|
||
kind stateKind | ||
generation int | ||
code int | ||
} | ||
|
||
// captureFunc is a pointer to a capture callback. Func values are not
// comparable, so the pointer is what makes it usable as a map key.
type captureFunc *func(string) error

// captureMap collects, per capture callback, the values recorded so far along
// one path through the NFA.
type captureMap map[captureFunc][]string

// clone returns a copy of cm whose value slices are copied as well, so that
// changes to the copy are not visible through cm.
func (cm captureMap) clone() captureMap {
	dup := make(captureMap, len(cm))
	for fn, values := range cm {
		dup[fn] = slices.Clone(values)
	}
	return dup
}
|
||
type statesAndCaptures struct { | ||
states []*MatchState | ||
captures []captureMap | ||
} | ||
|
||
func (s *MatchState) String() string { | ||
return fmt.Sprintf("state{kind: %d, generation: %d, code: %d}", s.kind, s.generation, s.code) | ||
} | ||
|
||
// Matchable is the element type Match operates on.
type Matchable interface {
	// Code returns the value compared against a pattern state's code.
	Code() int
	// Value returns the string handed to capture callbacks.
	Value() string
}
|
||
// Match returns whether the given Components match the Pattern defined in MatchState. | ||
// Errors are used to communicate capture errors. | ||
// If the error is non-nil the returned bool will be false. | ||
func Match[S ~[]T, T Matchable](s *MatchState, components S) (bool, error) { | ||
listGeneration := s.generation + 1 // Start at the last generation + 1 | ||
defer func() { s.generation = listGeneration }() // In case we reuse this state, store our highest generation number | ||
|
||
currentStates := statesAndCaptures{ | ||
states: make([]*MatchState, 0, 16), | ||
captures: make([]captureMap, 0, 16), | ||
} | ||
nextStates := statesAndCaptures{ | ||
states: make([]*MatchState, 0, 16), | ||
captures: make([]captureMap, 0, 16), | ||
} | ||
|
||
currentStates = appendState(currentStates, s, nil, listGeneration) | ||
|
||
for _, c := range components { | ||
if len(currentStates.states) == 0 { | ||
return false, nil | ||
} | ||
for i, s := range currentStates.states { | ||
if s.kind == matchCode && s.code == c.Code() { | ||
cm := currentStates.captures[i] | ||
if s.capture != nil { | ||
cm[s.capture] = append(cm[s.capture], c.Value()) | ||
} | ||
nextStates = appendState(nextStates, s.next, currentStates.captures[i], listGeneration) | ||
} | ||
} | ||
currentStates, nextStates = nextStates, currentStates | ||
nextStates.states = nextStates.states[:0] | ||
nextStates.captures = nextStates.captures[:0] | ||
listGeneration++ | ||
} | ||
|
||
for i, s := range currentStates.states { | ||
if s.kind == done { | ||
// We found a complete path. Run the captures now | ||
for f, v := range currentStates.captures[i] { | ||
for _, s := range v { | ||
if err := (*f)(s); err != nil { | ||
return false, err | ||
} | ||
} | ||
} | ||
return true, nil | ||
} | ||
} | ||
return false, nil | ||
} | ||
|
||
func appendState(arr statesAndCaptures, s *MatchState, c captureMap, listGeneration int) statesAndCaptures { | ||
if s == nil || s.generation == listGeneration { | ||
return arr | ||
} | ||
if c == nil { | ||
c = make(captureMap) | ||
} | ||
s.generation = listGeneration | ||
if s.kind == split { | ||
arr = appendState(arr, s.next, c, listGeneration) | ||
arr = appendState(arr, s.nextSplit, c.clone(), listGeneration) | ||
} else { | ||
arr.states = append(arr.states, s) | ||
arr.captures = append(arr.captures, c) | ||
} | ||
return arr | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,233 @@ | ||
package meg | ||
|
||
import ( | ||
"regexp" | ||
"slices" | ||
"testing" | ||
"testing/quick" | ||
) | ||
|
||
// codeAndValue is a minimal Matchable used by the tests.
type codeAndValue struct {
	code int
	// val is a string so that the stored value cannot be mutated.
	val string
}

// Code implements Matchable by returning the stored code.
func (cv codeAndValue) Code() int { return cv.code }

// Value implements Matchable by returning the stored value.
func (cv codeAndValue) Value() string { return cv.val }
|
||
var _ Matchable = codeAndValue{} | ||
|
||
func TestSimple(t *testing.T) { | ||
type testCase struct { | ||
pattern *MatchState | ||
skipQuickCheck bool | ||
shouldMatch [][]int | ||
shouldNotMatch [][]int | ||
} | ||
testCases := | ||
[]testCase{ | ||
{ | ||
pattern: PatternToMatchState(Val(0), Val(1)), | ||
shouldMatch: [][]int{{0, 1}}, | ||
shouldNotMatch: [][]int{ | ||
{0}, | ||
{0, 0}, | ||
{0, 1, 0}, | ||
}}, { | ||
pattern: PatternToMatchState(Val(0), Val(1), Optional(Val(2))), | ||
shouldMatch: [][]int{ | ||
{0, 1, 2}, | ||
{0, 1}, | ||
}, | ||
shouldNotMatch: [][]int{ | ||
{0}, | ||
{0, 0}, | ||
{0, 1, 0}, | ||
{0, 1, 2, 0}, | ||
}}, { | ||
pattern: PatternToMatchState(Val(0), Val(1), OneOrMore(2)), | ||
skipQuickCheck: true, | ||
shouldMatch: [][]int{ | ||
{0, 1, 2, 2, 2, 2}, | ||
{0, 1, 2}, | ||
}, | ||
shouldNotMatch: [][]int{ | ||
{0}, | ||
{0, 0}, | ||
{0, 1}, | ||
{0, 1, 0}, | ||
{0, 1, 1, 0}, | ||
{0, 1, 2, 0}, | ||
}}, | ||
} | ||
|
||
for i, tc := range testCases { | ||
for _, m := range tc.shouldMatch { | ||
if matches, _ := Match(tc.pattern, codesToCodeAndValue(m)); !matches { | ||
t.Fatalf("failed to match %v with %s. idx=%d", m, tc.pattern, i) | ||
} | ||
} | ||
for _, m := range tc.shouldNotMatch { | ||
if matches, _ := Match(tc.pattern, codesToCodeAndValue(m)); matches { | ||
t.Fatalf("failed to not match %v with %s. idx=%d", m, tc.pattern, i) | ||
} | ||
} | ||
if tc.skipQuickCheck { | ||
continue | ||
} | ||
if err := quick.Check(func(notMatch []int) bool { | ||
for _, shouldMatch := range tc.shouldMatch { | ||
if slices.Equal(notMatch, shouldMatch) { | ||
// The random `notMatch` is actually something that shouldMatch. Skip it. | ||
return true | ||
} | ||
} | ||
matches, _ := Match(tc.pattern, codesToCodeAndValue(notMatch)) | ||
return !matches | ||
}, &quick.Config{}); err != nil { | ||
t.Fatal(err) | ||
} | ||
} | ||
} | ||
|
||
func TestCapture(t *testing.T) { | ||
type setupStateAndAssert func() (*MatchState, func()) | ||
type testCase struct { | ||
setup setupStateAndAssert | ||
parts []codeAndValue | ||
} | ||
|
||
testCases := | ||
[]testCase{ | ||
{ | ||
setup: func() (*MatchState, func()) { | ||
var code0str string | ||
return PatternToMatchState(CaptureVal(0, &code0str), Val(1)), func() { | ||
if code0str != "hello" { | ||
panic("unexpected value") | ||
} | ||
} | ||
}, | ||
parts: []codeAndValue{{0, "hello"}, {1, "world"}}, | ||
}, | ||
{ | ||
setup: func() (*MatchState, func()) { | ||
var code0strs []string | ||
return PatternToMatchState(CaptureOneOrMore(0, &code0strs), Val(1)), func() { | ||
if code0strs[0] != "hello" { | ||
panic("unexpected value") | ||
} | ||
if code0strs[1] != "world" { | ||
panic("unexpected value") | ||
} | ||
} | ||
}, | ||
parts: []codeAndValue{{0, "hello"}, {0, "world"}, {1, ""}}, | ||
}, | ||
} | ||
|
||
_ = testCases | ||
for _, tc := range testCases { | ||
state, assert := tc.setup() | ||
if matches, _ := Match(state, tc.parts); !matches { | ||
t.Fatalf("failed to match %v with %s", tc.parts, state) | ||
} | ||
assert() | ||
} | ||
} | ||
|
||
func codesToCodeAndValue(codes []int) []codeAndValue { | ||
out := make([]codeAndValue, len(codes)) | ||
for i, c := range codes { | ||
out[i] = codeAndValue{code: c} | ||
} | ||
return out | ||
} | ||
|
||
func bytesToCodeAndValue(codes []byte) []codeAndValue { | ||
out := make([]codeAndValue, len(codes)) | ||
for i, c := range codes { | ||
out[i] = codeAndValue{code: int(c)} | ||
} | ||
return out | ||
} | ||
|
||
// FuzzMatchesRegexpBehavior fuzz tests the expression matcher by comparing it to the behavior of the regexp package. | ||
func FuzzMatchesRegexpBehavior(f *testing.F) { | ||
bytesToRegexpAndPattern := func(exp []byte) ([]byte, []Pattern) { | ||
if len(exp) < 3 { | ||
panic("regexp too short") | ||
} | ||
pattern := make([]Pattern, 0, len(exp)-2) | ||
for i, b := range exp { | ||
b = b % 32 | ||
if i == 0 { | ||
exp[i] = '^' | ||
continue | ||
} else if i == len(exp)-1 { | ||
exp[i] = '$' | ||
continue | ||
} | ||
switch { | ||
case b < 26: | ||
exp[i] = b + 'a' | ||
pattern = append(pattern, Val(int(exp[i]))) | ||
case i > 1 && b == 26: | ||
exp[i] = '?' | ||
pattern = pattern[:len(pattern)-1] | ||
pattern = append(pattern, Optional(Val(int(exp[i-1])))) | ||
case i > 1 && b == 27: | ||
exp[i] = '*' | ||
pattern = pattern[:len(pattern)-1] | ||
pattern = append(pattern, ZeroOrMore(int(exp[i-1]))) | ||
case i > 1 && b == 28: | ||
exp[i] = '+' | ||
pattern = pattern[:len(pattern)-1] | ||
pattern = append(pattern, OneOrMore(int(exp[i-1]))) | ||
default: | ||
exp[i] = 'a' | ||
pattern = append(pattern, Val(int(exp[i]))) | ||
} | ||
} | ||
|
||
return exp, pattern | ||
} | ||
|
||
simplifyB := func(buf []byte) []byte { | ||
for i, b := range buf { | ||
buf[i] = (b % 26) + 'a' | ||
} | ||
return buf | ||
} | ||
|
||
f.Fuzz(func(t *testing.T, expRules []byte, corpus []byte) { | ||
if len(expRules) < 3 || len(expRules) > 1024 || len(corpus) > 1024 { | ||
return | ||
} | ||
corpus = simplifyB(corpus) | ||
regexpPattern, pattern := bytesToRegexpAndPattern(expRules) | ||
matched, err := regexp.Match(string(regexpPattern), corpus) | ||
if err != nil { | ||
// Malformed regex. Ignore | ||
return | ||
} | ||
p := PatternToMatchState(pattern...) | ||
otherMatched, _ := Match(p, bytesToCodeAndValue(corpus)) | ||
if otherMatched != matched { | ||
t.Log("regexp", string(regexpPattern)) | ||
t.Log("corpus", string(corpus)) | ||
m2, err2 := regexp.Match(string(regexpPattern), corpus) | ||
t.Logf("regexp matched %v. %v. %v, %v. \n%v - \n%v", matched, err, m2, err2, regexpPattern, corpus) | ||
t.Logf("pattern %+v", pattern) | ||
t.Fatalf("mismatched results: %v %v %v", otherMatched, matched, p) | ||
} | ||
}) | ||
|
||
} |
Oops, something went wrong.