Skip to content

Commit

Permalink
encoding/json: simplify folded name logic
Browse files Browse the repository at this point in the history
The folded name logic (despite all attempts to optimize it)
was fundamentally an O(n) operation where every field in a struct
needed to be linearly scanned in order to find a match.
This made unmashaling of unknown fields always O(n).
Instead of optimizing the comparison for each field,
make it such that we can look up a name in O(1).

We accomplish this by maintaining a map keyed by pre-folded names,
which we can pre-calculate when processing the struct type.
Using a stack-allocated buffer, we can fold the input name and
look up its presence in the map.

Also, instead of mapping from names to indexes,
map directly to a pointer to the field information.
The memory cost of this is the same and avoids an extra slice index.

The new logic is both simpler and faster.

Performance:

	name                   old time/op    new time/op    delta
	CodeDecoder           2.47ms ± 4%    2.42ms ± 2%  -1.83%  (p=0.022 n=10+9)
	UnicodeDecoder         259ns ± 2%     248ns ± 1%  -4.32%  (p=0.000 n=10+10)
	DecoderStream          150ns ± 1%     149ns ± 1%    ~     (p=0.516 n=10+10)
	CodeUnmarshal         3.13ms ± 2%    3.09ms ± 2%  -1.37%  (p=0.022 n=10+9)
	CodeUnmarshalReuse    2.50ms ± 1%    2.45ms ± 1%  -1.96%  (p=0.001 n=8+9)
	UnmarshalString       67.1ns ± 5%    64.5ns ± 5%  -3.90%  (p=0.005 n=10+10)
	UnmarshalFloat64      60.1ns ± 4%    58.4ns ± 2%  -2.89%  (p=0.002 n=10+8)
	UnmarshalInt64        51.0ns ± 4%    49.2ns ± 1%  -3.53%  (p=0.001 n=10+8)
	Issue10335            80.7ns ± 2%    79.2ns ± 1%  -1.82%  (p=0.016 n=10+8)
	Issue34127            28.6ns ± 3%    28.8ns ± 3%    ~     (p=0.388 n=9+10)
	Unmapped               177ns ± 2%     177ns ± 2%    ~     (p=0.956 n=10+10)

Change-Id: I478b2b958f5a63a69c9a991a39cd5ffb43244a2a
Reviewed-on: https://go-review.googlesource.com/c/go/+/471196
Reviewed-by: Dmitri Shuralyov <[email protected]>
Run-TryBot: Joseph Tsai <[email protected]>
Auto-Submit: Joseph Tsai <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: Johan Brandhorst-Satzkorn <[email protected]>
Reviewed-by: Than McIntosh <[email protected]>
Reviewed-by: Daniel Martí <[email protected]>
  • Loading branch information
dsnet authored and gopherbot committed Feb 27, 2023
1 parent 2de406b commit b9b8cec
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 238 deletions.
17 changes: 3 additions & 14 deletions src/encoding/json/decode.go
Original file line number Diff line number Diff line change
Expand Up @@ -690,20 +690,9 @@ func (d *decodeState) object(v reflect.Value) error {
}
subv = mapElem
} else {
var f *field
if i, ok := fields.nameIndex[string(key)]; ok {
// Found an exact name match.
f = &fields.list[i]
} else {
// Fall back to the expensive case-insensitive
// linear search.
for i := range fields.list {
ff := &fields.list[i]
if ff.equalFold(ff.nameBytes, key) {
f = ff
break
}
}
f := fields.byExactName[string(key)]
if f == nil {
f = fields.byFoldedName[string(foldName(key))]
}
if f != nil {
subv = v
Expand Down
20 changes: 12 additions & 8 deletions src/encoding/json/encode.go
Original file line number Diff line number Diff line change
Expand Up @@ -672,8 +672,9 @@ type structEncoder struct {
}

type structFields struct {
list []field
nameIndex map[string]int
list []field
byExactName map[string]*field
byFoldedName map[string]*field
}

func (se structEncoder) encode(e *encodeState, v reflect.Value, opts encOpts) {
Expand Down Expand Up @@ -1033,8 +1034,7 @@ func appendString[Bytes []byte | string](dst []byte, src Bytes, escapeHTML bool)
// A field represents a single field found in a struct.
type field struct {
name string
nameBytes []byte // []byte(name)
equalFold func(s, t []byte) bool // bytes.EqualFold or equivalent
nameBytes []byte // []byte(name)

nameNonEsc string // `"` + name + `":`
nameEscHTML string // `"` + HTMLEscape(name) + `":`
Expand Down Expand Up @@ -1161,7 +1161,6 @@ func typeFields(t reflect.Type) structFields {
quoted: quoted,
}
field.nameBytes = []byte(field.name)
field.equalFold = foldFunc(field.nameBytes)

// Build nameEscHTML and nameNonEsc ahead of time.
nameEscBuf = appendHTMLEscape(nameEscBuf[:0], field.nameBytes)
Expand Down Expand Up @@ -1240,11 +1239,16 @@ func typeFields(t reflect.Type) structFields {
f := &fields[i]
f.encoder = typeEncoder(typeByIndex(t, f.index))
}
nameIndex := make(map[string]int, len(fields))
exactNameIndex := make(map[string]*field, len(fields))
foldedNameIndex := make(map[string]*field, len(fields))
for i, field := range fields {
nameIndex[field.name] = i
exactNameIndex[field.name] = &fields[i]
// For historical reasons, first folded match takes precedence.
if _, ok := foldedNameIndex[string(foldName(field.nameBytes))]; !ok {
foldedNameIndex[string(foldName(field.nameBytes))] = &fields[i]
}
}
return structFields{fields, nameIndex}
return structFields{fields, exactNameIndex, foldedNameIndex}
}

// dominantField looks through the fields, all of which are known to
Expand Down
147 changes: 27 additions & 120 deletions src/encoding/json/fold.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,137 +5,44 @@
package json

import (
"bytes"
"unicode"
"unicode/utf8"
)

const (
caseMask = ^byte(0x20) // Mask to ignore case in ASCII.
kelvin = '\u212a'
smallLongEss = '\u017f'
)

// foldFunc returns one of four different case folding equivalence
// functions, from most general (and slow) to fastest:
//
// 1) bytes.EqualFold, if the key s contains any non-ASCII UTF-8
// 2) equalFoldRight, if s contains special folding ASCII ('k', 'K', 's', 'S')
// 3) asciiEqualFold, no special, but includes non-letters (including _)
// 4) simpleLetterEqualFold, no specials, no non-letters.
//
// The letters S and K are special because they map to 3 runes, not just 2:
// - S maps to s and to U+017F 'ſ' Latin small letter long s
// - k maps to K and to U+212A 'K' Kelvin sign
//
// See https://play.golang.org/p/tTxjOc0OGo
//
// The returned function is specialized for matching against s and
// should only be given s. It's not curried for performance reasons.
func foldFunc(s []byte) func(s, t []byte) bool {
nonLetter := false
special := false // special letter
for _, b := range s {
if b >= utf8.RuneSelf {
return bytes.EqualFold
}
upper := b & caseMask
if upper < 'A' || upper > 'Z' {
nonLetter = true
} else if upper == 'K' || upper == 'S' {
// See above for why these letters are special.
special = true
}
}
if special {
return equalFoldRight
}
if nonLetter {
return asciiEqualFold
}
return simpleLetterEqualFold
// foldName returns a folded string such that foldName(x) == foldName(y)
// is identical to bytes.EqualFold(x, y).
func foldName(in []byte) []byte {
// This is inlinable to take advantage of "function outlining".
var arr [32]byte // large enough for most JSON names
return appendFoldedName(arr[:0], in)
}

// equalFoldRight is a specialization of bytes.EqualFold when s is
// known to be all ASCII (including punctuation), but contains an 's',
// 'S', 'k', or 'K', requiring a Unicode fold on the bytes in t.
// See comments on foldFunc.
func equalFoldRight(s, t []byte) bool {
for _, sb := range s {
if len(t) == 0 {
return false
}
tb := t[0]
if tb < utf8.RuneSelf {
if sb != tb {
sbUpper := sb & caseMask
if 'A' <= sbUpper && sbUpper <= 'Z' {
if sbUpper != tb&caseMask {
return false
}
} else {
return false
}
func appendFoldedName(out, in []byte) []byte {
for i := 0; i < len(in); {
// Handle single-byte ASCII.
if c := in[i]; c < utf8.RuneSelf {
if 'a' <= c && c <= 'z' {
c -= 'a' - 'A'
}
t = t[1:]
out = append(out, c)
i++
continue
}
// sb is ASCII and t is not. t must be either kelvin
// sign or long s; sb must be s, S, k, or K.
tr, size := utf8.DecodeRune(t)
switch sb {
case 's', 'S':
if tr != smallLongEss {
return false
}
case 'k', 'K':
if tr != kelvin {
return false
}
default:
return false
}
t = t[size:]

}
return len(t) == 0
}

// asciiEqualFold is a specialization of bytes.EqualFold for use when
// s is all ASCII (but may contain non-letters) and contains no
// special-folding letters.
// See comments on foldFunc.
func asciiEqualFold(s, t []byte) bool {
if len(s) != len(t) {
return false
}
for i, sb := range s {
tb := t[i]
if sb == tb {
continue
}
if ('a' <= sb && sb <= 'z') || ('A' <= sb && sb <= 'Z') {
if sb&caseMask != tb&caseMask {
return false
}
} else {
return false
}
// Handle multi-byte Unicode.
r, n := utf8.DecodeRune(in[i:])
out = utf8.AppendRune(out, foldRune(r))
i += n
}
return true
return out
}

// simpleLetterEqualFold is a specialization of bytes.EqualFold for
// use when s is all ASCII letters (no underscores, etc) and also
// doesn't contain 'k', 'K', 's', or 'S'.
// See comments on foldFunc.
func simpleLetterEqualFold(s, t []byte) bool {
if len(s) != len(t) {
return false
}
for i, b := range s {
if b&caseMask != t[i]&caseMask {
return false
// foldRune is returns the smallest rune for all runes in the same fold set.
func foldRune(r rune) rune {
for {
r2 := unicode.SimpleFold(r)
if r2 <= r {
return r2
}
r = r2
}
return true
}
132 changes: 36 additions & 96 deletions src/encoding/json/fold_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,105 +6,45 @@ package json

import (
"bytes"
"strings"
"testing"
"unicode/utf8"
)

var foldTests = []struct {
fn func(s, t []byte) bool
s, t string
want bool
}{
{equalFoldRight, "", "", true},
{equalFoldRight, "a", "a", true},
{equalFoldRight, "", "a", false},
{equalFoldRight, "a", "", false},
{equalFoldRight, "a", "A", true},
{equalFoldRight, "AB", "ab", true},
{equalFoldRight, "AB", "ac", false},
{equalFoldRight, "sbkKc", "ſbKKc", true},
{equalFoldRight, "SbKkc", "ſbKKc", true},
{equalFoldRight, "SbKkc", "ſbKK", false},
{equalFoldRight, "e", "é", false},
{equalFoldRight, "s", "S", true},

{simpleLetterEqualFold, "", "", true},
{simpleLetterEqualFold, "abc", "abc", true},
{simpleLetterEqualFold, "abc", "ABC", true},
{simpleLetterEqualFold, "abc", "ABCD", false},
{simpleLetterEqualFold, "abc", "xxx", false},

{asciiEqualFold, "a_B", "A_b", true},
{asciiEqualFold, "aa@", "aa`", false}, // verify 0x40 and 0x60 aren't case-equivalent
}

func TestFold(t *testing.T) {
for i, tt := range foldTests {
if got := tt.fn([]byte(tt.s), []byte(tt.t)); got != tt.want {
t.Errorf("%d. %q, %q = %v; want %v", i, tt.s, tt.t, got, tt.want)
}
truth := strings.EqualFold(tt.s, tt.t)
if truth != tt.want {
t.Errorf("strings.EqualFold doesn't agree with case %d", i)
}
func FuzzEqualFold(f *testing.F) {
for _, ss := range [][2]string{
{"", ""},
{"123abc", "123ABC"},
{"αβδ", "ΑΒΔ"},
{"abc", "xyz"},
{"abc", "XYZ"},
{"1", "2"},
{"hello, world!", "hello, world!"},
{"hello, world!", "Hello, World!"},
{"hello, world!", "HELLO, WORLD!"},
{"hello, world!", "jello, world!"},
{"γειά, κόσμε!", "γειά, κόσμε!"},
{"γειά, κόσμε!", "Γειά, Κόσμε!"},
{"γειά, κόσμε!", "ΓΕΙΆ, ΚΌΣΜΕ!"},
{"γειά, κόσμε!", "ΛΕΙΆ, ΚΌΣΜΕ!"},
{"AESKey", "aesKey"},
{"AESKEY", "aes_key"},
{"aes_key", "AES_KEY"},
{"AES_KEY", "aes-key"},
{"aes-key", "AES-KEY"},
{"AES-KEY", "aesKey"},
{"aesKey", "AesKey"},
{"AesKey", "AESKey"},
{"AESKey", "aeskey"},
{"DESKey", "aeskey"},
{"AES Key", "aeskey"},
} {
f.Add([]byte(ss[0]), []byte(ss[1]))
}
}

func TestFoldAgainstUnicode(t *testing.T) {
var buf1, buf2 []byte
var runes []rune
for i := 0x20; i <= 0x7f; i++ {
runes = append(runes, rune(i))
}
runes = append(runes, kelvin, smallLongEss)

funcs := []struct {
name string
fold func(s, t []byte) bool
letter bool // must be ASCII letter
simple bool // must be simple ASCII letter (not 'S' or 'K')
}{
{
name: "equalFoldRight",
fold: equalFoldRight,
},
{
name: "asciiEqualFold",
fold: asciiEqualFold,
simple: true,
},
{
name: "simpleLetterEqualFold",
fold: simpleLetterEqualFold,
simple: true,
letter: true,
},
}

for _, ff := range funcs {
for _, r := range runes {
if r >= utf8.RuneSelf {
continue
}
if ff.letter && !isASCIILetter(byte(r)) {
continue
}
if ff.simple && (r == 's' || r == 'S' || r == 'k' || r == 'K') {
continue
}
for _, r2 := range runes {
buf1 = append(utf8.AppendRune(append(buf1[:0], 'x'), r), 'x')
buf2 = append(utf8.AppendRune(append(buf2[:0], 'x'), r2), 'x')
want := bytes.EqualFold(buf1, buf2)
if got := ff.fold(buf1, buf2); got != want {
t.Errorf("%s(%q, %q) = %v; want %v", ff.name, buf1, buf2, got, want)
}
}
equalFold := func(x, y []byte) bool { return string(foldName(x)) == string(foldName(y)) }
f.Fuzz(func(t *testing.T, x, y []byte) {
got := equalFold(x, y)
want := bytes.EqualFold(x, y)
if got != want {
t.Errorf("equalFold(%q, %q) = %v, want %v", x, y, got, want)
}
}
}

func isASCIILetter(b byte) bool {
return ('A' <= b && b <= 'Z') || ('a' <= b && b <= 'z')
})
}

0 comments on commit b9b8cec

Please sign in to comment.