manually inline functions in the slide
This makes the slide faster than anything else on most inputs.
Extremely small inputs are slightly slower.
Jorropo committed Jun 20, 2024
1 parent 907c96d commit a0584e2
Showing 2 changed files with 2,797 additions and 815 deletions.
75 changes: 51 additions & 24 deletions gen/slide.go
@@ -4,6 +4,7 @@ import (
"bytes"
"fmt"
"os"
"strings"
)

const sumSlideSize = 127
@@ -17,6 +18,8 @@ func slide() error {
package xxhash
import "math/bits"
// Generated by gen/slide.go. DO NOT EDIT.
const slideLength = %d
@@ -38,6 +41,7 @@ func slide(b []byte) uint64 {
// This means I make very liberal use of goto; they shouldn't be read as JMPs but as links between abstract basic blocks.
// - The compiler has some SSA passes.
// This is used for all the b_* tricks.
// - We can't trust the compiler to inline anything (except intrinsics).
// Set up variables here since Go doesn't want us to do dangerous gotos.
v1 := prime1
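For context, here is a minimal sketch (not part of the diff, assuming the package's prime1/prime2 constants and math/bits) of the round helper that the generated code expands inline rather than calling; genRound, added further down in this file, writes exactly this body into the slide:

// Sketch only: the helper being manually inlined.
func round(acc, input uint64) uint64 {
	acc += input * prime2
	acc = bits.RotateLeft64(acc, 31)
	acc *= prime1
	return acc
}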
@@ -92,32 +96,28 @@ func slide(b []byte) uint64 {
fmt.Fprintf(w, `sz_%d:
{
b := b_%d[:]
load := uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 // Work around for go.dev/issue/68081.
b = b[8:]
v1 = round(v1, load)
load = uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 // Work around for go.dev/issue/68081.
b = b[8:]
v2 = round(v2, load)
load = uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 // Work around for go.dev/issue/68081.
var load uint64
`, i, i)
for r := 1; r <= 4; r++ {
fmt.Fprintf(w, ` load = %s
b = b[8:]
v3 = round(v3, load)
load = uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 // Work around for go.dev/issue/68081.
v4 = round(v4, load)
b_%d = (*[%d]byte)(b_%d[32:])
`, load64)
genRound(w, 2, fmt.Sprintf("v%d", r), "load")
}
fmt.Fprintf(w, `b_%d = (*[%d]byte)(b_%d[32:])
}
`, i, i, i-32, i-32, i)
`, i-32, i-32, i)
// POTENTIAL OPTIMIZATION: b[32:] creates an addition to bump the pointer, which means the address dependency of the memory loads is not resolved before the jmp table. I know two fixes:
// - change b to a pointer to the end of the slice and subtract the total offset. I don't know how to do this in pure Go.
// - don't bother reusing the slides; each load instruction can then hardcode its offset. This makes the code significantly bigger and i-cache worse, although I didn't try it.
}
w.WriteString(` h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
h = mergeRound(h, v1)
h = mergeRound(h, v2)
h = mergeRound(h, v3)
h = mergeRound(h, v4)
w.WriteString(` h = bits.RotateLeft64(v1, 1) + bits.RotateLeft64(v2, 7) + bits.RotateLeft64(v3, 12) + bits.RotateLeft64(v4, 18)
`)
for r := 1; r <= 4; r++ {
genMergeRound(w, 1, "h", fmt.Sprintf("v%d", r))
}
w.WriteString("\n")
if i != 0 { // Avoid « label sz_0 defined and not used », case 0 shortcuts with a precomputed value.
fmt.Fprintf(w, "sz_%d:\n", i)
}
@@ -134,13 +134,16 @@ func slide(b []byte) uint64 {
fmt.Fprintf(w, `sz_%dl:
{
b := b_%d[:]
load := uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 // Work around for go.dev/issue/68081.
h ^= round(0, load)
h = rol27(h)*prime1 + prime4
load := %s
var temp uint64
`, i, i, load64)
genRound(w, 2, "temp", "load")
fmt.Fprintf(w, `h ^= temp
h = bits.RotateLeft64(h, 27)*prime1 + prime4
b_%d = (*[%d]byte)(b_%d[8:])
}
`, i, i, i-8, i-8, i)
`, i-8, i-8, i)
}
fmt.Fprintf(w, `goto sz_%dl
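Each emitted sz_Nl block boils down to the standard xxHash64 8-byte tail step. A sketch of that computation as an ordinary function (assuming encoding/binary, the round helper sketched above, and the package's primes); the generated code instead spells out the load manually and advances a fixed-size array pointer:

// Sketch only: what one generated 8-byte tail block computes.
func tailStep8(h uint64, b []byte) uint64 {
	load := binary.LittleEndian.Uint64(b)
	h ^= round(0, load)
	return bits.RotateLeft64(h, 27)*prime1 + prime4
}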
@@ -155,7 +158,7 @@ func slide(b []byte) uint64 {
b := b_%d[:]
load := uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 // Work around for go.dev/issue/68081.
h ^= uint64(load) * prime1
h = rol23(h)*prime2 + prime3
h = bits.RotateLeft64(h, 23)*prime2 + prime3
b_%d = (*[%d]byte)(b_%d[4:])
goto sz_%dl
}
@@ -168,7 +171,7 @@ func slide(b []byte) uint64 {
i--
fmt.Fprintf(w, `sz_%dl:
h ^= uint64(b_%d[0]) * prime5
h = rol11(h) * prime1
h = bits.RotateLeft64(h, 11) * prime1
b_%d = (*[%d]byte)(b_%d[1:])
`, i, i, i-1, i-1, i)
@@ -207,3 +210,27 @@ sz_0l:

return nil
}

func genRound(w *bytes.Buffer, tab uint, acc, input string) {
tabs := strings.Repeat("\t", int(tab))
fmt.Fprintf(w, `%s%s += %s * prime2
%s%s = bits.RotateLeft64(%s, 31)
%s%s *= prime1
`, tabs, acc, input, tabs, acc, acc, tabs, acc)
}
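As a quick illustration of what this helper produces, a call such as the following appends the xxHash64 round, written out inline with the requested indentation:

var w bytes.Buffer
genRound(&w, 2, "v1", "load")
// w now holds (indented with two tabs):
//	v1 += load * prime2
//	v1 = bits.RotateLeft64(v1, 31)
//	v1 *= prime1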

func genMergeRound(w *bytes.Buffer, tab uint, acc, val string) {
tabs := strings.Repeat("\t", int(tab))
tab++
tabsp := strings.Repeat("\t", int(tab))
fmt.Fprintf(w, `%s{
%svar temp uint64
`, tabs, tabsp)
genRound(w, tab, "temp", val)
fmt.Fprintf(w, `%s%s ^= temp
%s%s = %s*prime1 + prime4
%s}
`, tabsp, acc, tabsp, acc, acc, tabs)
}
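Likewise, genMergeRound(&w, 1, "h", "v1") appends the following block, which inlines mergeRound by folding a fresh round of v1 into h:

	{
		var temp uint64
		temp += v1 * prime2
		temp = bits.RotateLeft64(temp, 31)
		temp *= prime1
		h ^= temp
		h = h*prime1 + prime4
	}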

const load64 = "uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 // Work around for go.dev/issue/68081."
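The load64 expression is just an unrolled little-endian 64-bit load. A minimal standalone sketch (not part of the generator) showing it matches binary.LittleEndian.Uint64, the call one would normally write before working around the compiler issue referenced above:

package main

import (
	"encoding/binary"
	"fmt"
)

// manualLoad64 mirrors the expression the generator emits via load64.
func manualLoad64(b []byte) uint64 {
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}

func main() {
	b := []byte{1, 2, 3, 4, 5, 6, 7, 8}
	fmt.Println(manualLoad64(b) == binary.LittleEndian.Uint64(b)) // prints true
}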