Skip to content

Commit

Permalink
Add multiaddr expression group matching
Browse files Browse the repository at this point in the history
Support captures

export some things

wip thinking about public API

Think about exposing meg as a public API

doc comments

Finish rename

Add helper for meg and add test

add comment for devs
  • Loading branch information
MarcoPolo committed Jan 20, 2025
1 parent 3ca4833 commit 126f9ef
Show file tree
Hide file tree
Showing 6 changed files with 545 additions and 0 deletions.
4 changes: 4 additions & 0 deletions component.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,10 @@ func (c Component) Protocol() Protocol {
return c.protocol
}

// Code returns the Code field of the Protocol reported by c.Protocol().
func (c Component) Code() int {
	proto := c.Protocol()
	return proto.Code
}

// RawValue returns the component's underlying bytes from c.offset to the end,
// converted to a []byte.
func (c Component) RawValue() []byte {
	tail := c.bytes[c.offset:]
	return []byte(tail)
}
Expand Down
126 changes: 126 additions & 0 deletions meg/meg.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// Package meg implements regular expressions for multiaddr Components. The name is short for "Megular Expressions".
package meg

// The developer is assumed to be familiar with the Thompson NFA approach to
// regex before making changes to this file. Refer to
// https://swtch.com/~rsc/regexp/regexp1.html for an introduction.

import (
"fmt"
"slices"
)

// stateKind discriminates the role a MatchState plays in the NFA.
type stateKind uint8

const (
	// matchCode states consume one component whose code equals MatchState.code.
	matchCode stateKind = iota
	// split states consume nothing and fork into both next and nextSplit.
	split
	// done marks an accepting state.
	done
)

// MatchState is the Thompson NFA for a regular expression.
//
// Match stamps the generation field of every state it visits, so a MatchState
// graph is mutated by matching and must not be matched from several goroutines
// at the same time.
type MatchState struct {
	capture   captureFunc // optional; when set, values of matched components are recorded for it
	next      *MatchState // successor state
	nextSplit *MatchState // second successor; only followed when kind == split

	kind       stateKind
	generation int // generation of the most recent state list this state was added to
	code       int // component code accepted by a matchCode state
}

// captureFunc identifies a capture callback. It is a pointer to the function
// rather than the function itself because Go function values are not
// comparable and therefore cannot be used as map keys.
type captureFunc *func(string) error

// captureMap accumulates, per capture callback, the values captured so far
// along one path through the NFA.
type captureMap map[captureFunc][]string

// clone returns a copy of cm whose value slices do not share backing storage
// with the originals, so diverging NFA paths can extend them independently.
func (cm captureMap) clone() captureMap {
	out := make(captureMap, len(cm))
	for k, v := range cm {
		out[k] = slices.Clone(v)
	}
	return out
}

// statesAndCaptures is a list of live NFA states. captures[i] holds the
// captures collected on the path that led to states[i].
type statesAndCaptures struct {
	states   []*MatchState
	captures []captureMap
}

// String renders the state's kind, generation and code for debugging.
func (s *MatchState) String() string {
	return fmt.Sprintf("state{kind: %d, generation: %d, code: %d}", s.kind, s.generation, s.code)
}

// Matchable is the view of a component that Match needs.
type Matchable interface {
	Code() int
	Value() string // Used when capturing the value
}

// Match returns whether the given Components match the Pattern defined in MatchState.
// Errors are used to communicate capture errors.
// If the error is non-nil the returned bool will be false.
//
// Capture callbacks are only invoked once a complete matching path has been
// found; the callbacks of the first accepting state in the final list are run.
func Match[S ~[]T, T Matchable](s *MatchState, components S) (bool, error) {
	listGeneration := s.generation + 1               // Start at the last generation + 1
	defer func() { s.generation = listGeneration }() // In case we reuse this state, store our highest generation number

	currentStates := statesAndCaptures{
		states:   make([]*MatchState, 0, 16),
		captures: make([]captureMap, 0, 16),
	}
	nextStates := statesAndCaptures{
		states:   make([]*MatchState, 0, 16),
		captures: make([]captureMap, 0, 16),
	}

	currentStates = appendState(currentStates, s, nil, listGeneration)

	for _, c := range components {
		if len(currentStates.states) == 0 {
			return false, nil
		}

		// Every state list needs its own generation. The states in
		// currentStates are stamped with the previous generation; reusing
		// that value here would make appendState believe they are already in
		// nextStates and drop them, e.g. the head of a repetition that loops
		// back to a state of the current list.
		listGeneration++

		code := c.Code()
		for i, state := range currentStates.states {
			if state.kind != matchCode || state.code != code {
				continue
			}
			cm := currentStates.captures[i]
			if state.capture != nil {
				cm[state.capture] = append(cm[state.capture], c.Value())
			}
			nextStates = appendState(nextStates, state.next, cm, listGeneration)
		}

		// Swap the lists and recycle the old one's storage for the next step.
		currentStates, nextStates = nextStates, currentStates
		nextStates.states = nextStates.states[:0]
		nextStates.captures = nextStates.captures[:0]
	}

	for i, state := range currentStates.states {
		if state.kind != done {
			continue
		}
		// We found a complete path. Run the captures now
		for f, values := range currentStates.captures[i] {
			for _, v := range values {
				if err := (*f)(v); err != nil {
					return false, err
				}
			}
		}
		return true, nil
	}
	return false, nil
}

// appendState adds s to arr unless s is nil or already stamped with
// listGeneration (meaning it is already part of the list being built).
// Split states are not stored themselves; both of their branches are followed
// instead, with the nextSplit branch receiving its own copy of the captures.
// A nil c is replaced by a fresh, empty captureMap.
func appendState(arr statesAndCaptures, s *MatchState, c captureMap, listGeneration int) statesAndCaptures {
	if s == nil || s.generation == listGeneration {
		return arr
	}
	if c == nil {
		c = make(captureMap)
	}
	s.generation = listGeneration
	if s.kind == split {
		arr = appendState(arr, s.next, c, listGeneration)
		arr = appendState(arr, s.nextSplit, c.clone(), listGeneration)
	} else {
		arr.states = append(arr.states, s)
		arr.captures = append(arr.captures, c)
	}
	return arr
}
233 changes: 233 additions & 0 deletions meg/meg_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
package meg

import (
"regexp"
"slices"
"testing"
"testing/quick"
)

// codeAndValue is a minimal test component pairing a code with a value.
type codeAndValue struct {
	code int
	val  string // Uses the string type to ensure immutability.
}

// Code implements Matchable by returning the stored code.
func (cv codeAndValue) Code() int { return cv.code }

// Value implements Matchable by returning the stored value.
func (cv codeAndValue) Value() string { return cv.val }

var _ Matchable = codeAndValue{}

// TestSimple checks capture-free patterns against hand-written inputs that
// must and must not match, and (unless skipQuickCheck is set) against random
// inputs generated by testing/quick, which are expected not to match unless
// they equal one of the shouldMatch inputs.
func TestSimple(t *testing.T) {
	type testCase struct {
		pattern *MatchState
		// skipQuickCheck disables the random check for patterns that accept
		// more inputs than can be enumerated in shouldMatch.
		skipQuickCheck bool
		shouldMatch    [][]int
		shouldNotMatch [][]int
	}
	testCases :=
		[]testCase{
			{
				pattern:     PatternToMatchState(Val(0), Val(1)),
				shouldMatch: [][]int{{0, 1}},
				shouldNotMatch: [][]int{
					{0},
					{0, 0},
					{0, 1, 0},
				}}, {
				pattern: PatternToMatchState(Val(0), Val(1), Optional(Val(2))),
				shouldMatch: [][]int{
					{0, 1, 2},
					{0, 1},
				},
				shouldNotMatch: [][]int{
					{0},
					{0, 0},
					{0, 1, 0},
					{0, 1, 2, 0},
				}}, {
				pattern:        PatternToMatchState(Val(0), Val(1), OneOrMore(2)),
				skipQuickCheck: true,
				shouldMatch: [][]int{
					{0, 1, 2, 2, 2, 2},
					{0, 1, 2},
				},
				shouldNotMatch: [][]int{
					{0},
					{0, 0},
					{0, 1},
					{0, 1, 0},
					{0, 1, 1, 0},
					{0, 1, 2, 0},
				}},
		}

	for i, tc := range testCases {
		for _, m := range tc.shouldMatch {
			matches, err := Match(tc.pattern, codesToCodeAndValue(m))
			if err != nil {
				t.Fatalf("unexpected error matching %v with %s. idx=%d: %v", m, tc.pattern, i, err)
			}
			if !matches {
				t.Fatalf("failed to match %v with %s. idx=%d", m, tc.pattern, i)
			}
		}
		for _, m := range tc.shouldNotMatch {
			matches, err := Match(tc.pattern, codesToCodeAndValue(m))
			if err != nil {
				t.Fatalf("unexpected error matching %v with %s. idx=%d: %v", m, tc.pattern, i, err)
			}
			if matches {
				t.Fatalf("failed to not match %v with %s. idx=%d", m, tc.pattern, i)
			}
		}
		if tc.skipQuickCheck {
			continue
		}
		if err := quick.Check(func(notMatch []int) bool {
			for _, shouldMatch := range tc.shouldMatch {
				if slices.Equal(notMatch, shouldMatch) {
					// The random `notMatch` is actually something that shouldMatch. Skip it.
					return true
				}
			}
			// These patterns have no captures, so an error is as much a
			// failure as an unexpected match.
			matches, err := Match(tc.pattern, codesToCodeAndValue(notMatch))
			return err == nil && !matches
		}, &quick.Config{}); err != nil {
			t.Fatal(err)
		}
	}
}

// TestCapture checks that values of matched components are delivered to the
// capture targets once a pattern matches.
func TestCapture(t *testing.T) {
	// setupStateAndAssert builds a pattern together with a check that
	// inspects the variables the pattern captures into.
	type setupStateAndAssert func() (*MatchState, func(t *testing.T))
	type testCase struct {
		setup setupStateAndAssert
		parts []codeAndValue
	}

	testCases :=
		[]testCase{
			{
				setup: func() (*MatchState, func(t *testing.T)) {
					var code0str string
					return PatternToMatchState(CaptureVal(0, &code0str), Val(1)), func(t *testing.T) {
						t.Helper()
						if code0str != "hello" {
							t.Fatalf("unexpected value: got %q, want %q", code0str, "hello")
						}
					}
				},
				parts: []codeAndValue{{0, "hello"}, {1, "world"}},
			},
			{
				setup: func() (*MatchState, func(t *testing.T)) {
					var code0strs []string
					return PatternToMatchState(CaptureOneOrMore(0, &code0strs), Val(1)), func(t *testing.T) {
						t.Helper()
						// Compare the whole slice so a short capture fails
						// the test instead of panicking on an index.
						want := []string{"hello", "world"}
						if !slices.Equal(code0strs, want) {
							t.Fatalf("unexpected value: got %q, want %q", code0strs, want)
						}
					}
				},
				parts: []codeAndValue{{0, "hello"}, {0, "world"}, {1, ""}},
			},
		}

	for _, tc := range testCases {
		state, assert := tc.setup()
		matches, err := Match(state, tc.parts)
		if err != nil {
			t.Fatalf("unexpected error matching %v with %s: %v", tc.parts, state, err)
		}
		if !matches {
			t.Fatalf("failed to match %v with %s", tc.parts, state)
		}
		assert(t)
	}
}

// codesToCodeAndValue wraps each code in a codeAndValue with an empty value,
// preserving order.
func codesToCodeAndValue(codes []int) []codeAndValue {
	wrapped := make([]codeAndValue, 0, len(codes))
	for _, code := range codes {
		wrapped = append(wrapped, codeAndValue{code: code})
	}
	return wrapped
}

// bytesToCodeAndValue converts each byte to an int code and wraps it in a
// codeAndValue with an empty value, preserving order.
func bytesToCodeAndValue(codes []byte) []codeAndValue {
	result := make([]codeAndValue, len(codes))
	for idx := range codes {
		result[idx].code = int(codes[idx])
	}
	return result
}

// FuzzMatchesRegexpBehavior fuzz tests the expression matcher by comparing it to the behavior of the regexp package.
// FuzzMatchesRegexpBehavior fuzz tests the expression matcher by comparing it to the behavior of the regexp package.
//
// The fuzz inputs are never modified in place: the testing package requires
// that a fuzz target leaves the data behind its arguments untouched, so both
// helpers below work on copies.
func FuzzMatchesRegexpBehavior(f *testing.F) {
	// bytesToRegexpAndPattern derives, from arbitrary bytes, an anchored
	// regular expression over the letters a-z with the operators ?, * and +,
	// together with the equivalent meg pattern.
	bytesToRegexpAndPattern := func(rules []byte) ([]byte, []Pattern) {
		if len(rules) < 3 {
			panic("regexp too short")
		}
		exp := slices.Clone(rules)
		pattern := make([]Pattern, 0, len(exp)-2)
		// afterQuantifier reports whether the previous expression byte was
		// one of ?, * or +.
		afterQuantifier := false
		for i, b := range exp {
			b = b % 32
			if i == 0 {
				exp[i] = '^'
				continue
			} else if i == len(exp)-1 {
				exp[i] = '$'
				continue
			}
			switch {
			case b < 26:
				exp[i] = b + 'a'
				pattern = append(pattern, Val(int(exp[i])))
				afterQuantifier = false
			case i > 1 && b == 26:
				exp[i] = '?'
				// Directly after a quantifier, `?` only makes that
				// quantifier non-greedy (e.g. `a*?`). The set of accepted
				// inputs is unchanged, so the pattern is left alone.
				if !afterQuantifier {
					pattern[len(pattern)-1] = Optional(Val(int(exp[i-1])))
				}
				afterQuantifier = true
			case i > 1 && b == 27:
				// A `*` stacked on another quantifier is rejected by
				// regexp, and such inputs are skipped by the caller.
				exp[i] = '*'
				pattern[len(pattern)-1] = ZeroOrMore(int(exp[i-1]))
				afterQuantifier = true
			case i > 1 && b == 28:
				// Same as above for `+`.
				exp[i] = '+'
				pattern[len(pattern)-1] = OneOrMore(int(exp[i-1]))
				afterQuantifier = true
			default:
				exp[i] = 'a'
				pattern = append(pattern, Val(int(exp[i])))
				afterQuantifier = false
			}
		}

		return exp, pattern
	}

	// simplifyB returns a copy of buf with every byte mapped into 'a'..'z'.
	simplifyB := func(buf []byte) []byte {
		out := make([]byte, len(buf))
		for i, b := range buf {
			out[i] = (b % 26) + 'a'
		}
		return out
	}

	f.Fuzz(func(t *testing.T, expRules []byte, corpus []byte) {
		if len(expRules) < 3 || len(expRules) > 1024 || len(corpus) > 1024 {
			return
		}
		corpus = simplifyB(corpus)
		regexpPattern, pattern := bytesToRegexpAndPattern(expRules)
		matched, err := regexp.Match(string(regexpPattern), corpus)
		if err != nil {
			// Malformed regex. Ignore
			return
		}
		p := PatternToMatchState(pattern...)
		otherMatched, matchErr := Match(p, bytesToCodeAndValue(corpus))
		if matchErr != nil {
			// The generated patterns contain no captures, so no error is expected.
			t.Fatalf("unexpected error from Match: %v", matchErr)
		}
		if otherMatched != matched {
			t.Log("regexp", string(regexpPattern))
			t.Log("corpus", string(corpus))
			m2, err2 := regexp.Match(string(regexpPattern), corpus)
			t.Logf("regexp matched %v. %v. %v, %v. \n%v - \n%v", matched, err, m2, err2, regexpPattern, corpus)
			t.Logf("pattern %+v", pattern)
			t.Fatalf("mismatched results: %v %v %v", otherMatched, matched, p)
		}
	})

}
Loading

0 comments on commit 126f9ef

Please sign in to comment.