-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathmonocase.go
89 lines (84 loc) · 3.46 KB
/
monocase.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
package quamina
import (
"errors"
"fmt"
"unicode/utf8"
)
func readMonocaseSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typedVal, err error) {
t, err := pb.jd.Token()
if err != nil {
return
}
pathVals = valsIn
monocaseString, ok := t.(string)
if !ok {
err = errors.New("value for 'prefix' must be a string")
return
}
val := typedVal{
vType: monocaseType,
val: `"` + monocaseString + `"`,
}
pathVals = append(pathVals, val)
// has to be } or tokenizer will throw error
_, err = pb.jd.Token()
return
}
// makeMonocaseFA builds a FA to match "ignore-case" patterns. The Unicode Standard specifies algorithm 3.13,
// relying on the file CaseFolding.txt in the Unicode Character Database. This function uses the "Simple" flavor
// of casefolding, i.e. the lines in CaseFolding.txt that are marked with "C". The discussion in the Unicode
// standard doesn't mention this, but the algorithm essentially replaces upper-case characters with lower-case
// equivalents.
// We need to exercise caution to keep from creating states wastefully. For "CAT", after matching '"',
// you transition on either 'c' or 'C' but in this particular case you want to transition to the same
// next state. Note that there are many characters in Unicode where the upper and lower case forms are
// multi-byte and in fact not even the same number of bytes. So in that case you need two paths forward that step
// through the bytes of each form and then rejoin to arrive at a state. Also note
// that in many cases the upper/lower case versions of a rune have leading bytes in common
func makeMonocaseFA(val []byte, pp printer) (*smallTable, *fieldMatcher) {
fm := newFieldMatcher()
index := 0
table := newSmallTable() // start state
startTable := table
var nextStep *faNext
for index < len(val) {
var orig, alt []byte
r, width := utf8.DecodeRune(val[index:])
orig = val[index : index+width]
altRune, ok := caseFoldingPairs[r]
if ok {
alt = make([]byte, utf8.RuneLen(altRune))
utf8.EncodeRune(alt, altRune)
}
nextStep = &faNext{states: []*faState{{table: newSmallTable()}}}
pp.labelTable(nextStep.states[0].table, fmt.Sprintf("On %d, alt=%v", val[index], alt))
if alt == nil {
// easy case, no casefolding issues. We should maybe try to coalesce these
// no-casefolding sections and only call makeFAFragment once for all of them
origFA := makeFAFragment(orig, nextStep, pp)
table.addByteStep(orig[0], origFA)
} else {
// two paths to next state
// but they might have a common prefix
var commonPrefix int
for commonPrefix = 0; orig[commonPrefix] == alt[commonPrefix]; commonPrefix++ {
prefixNext := &faNext{states: []*faState{{table: newSmallTable()}}}
table.addByteStep(orig[commonPrefix], prefixNext)
table = prefixNext.states[0].table
pp.labelTable(table, fmt.Sprintf("common prologue on %v", orig[commonPrefix]))
}
// now build automata for the orig and alt versions of the char
// TODO: make sure that makeFAFragment works with length == 1
origFA := makeFAFragment(orig[commonPrefix:], nextStep, pp)
altFA := makeFAFragment(alt[commonPrefix:], nextStep, pp)
table.addByteStep(orig[commonPrefix], origFA)
table.addByteStep(alt[commonPrefix], altFA)
}
table = nextStep.states[0].table
index += width
}
laststate := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{fm}}
lastStep := &faNext{states: []*faState{laststate}}
nextStep.states[0].table.addByteStep(valueTerminator, lastStep)
return startTable, fm
}