manually inline functions in the slide
This makes the slide faster than anything else on most inputs.
Extremely small inputs are slightly slower.
Jorropo committed Jun 20, 2024
1 parent 907c96d commit a0584e2
Showing 2 changed files with 2,797 additions and 815 deletions.
75 changes: 51 additions & 24 deletions gen/slide.go
@@ -4,6 +4,7 @@ import (
"bytes"
"fmt"
"os"
"strings"
)

const sumSlideSize = 127
@@ -17,6 +18,8 @@ func slide() error {
package xxhash
import "math/bits"
// Generated by gen/slide.go. DO NOT EDIT.
const slideLength = %d
@@ -38,6 +41,7 @@ func slide(b []byte) uint64 {
// This means I make very liberal use of goto; they shouldn't be read as JMPs but as links between abstract basic blocks.
// - The compiler has some SSA passes.
// This is used for all the b_* tricks.
// - We can't trust the compiler to inline anything (except intrinsics).
// Set up variables here since Go doesn't want us to do dangerous gotos.
v1 := prime1
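For context, here is a minimal sketch (not part of the diff, assuming the package's prime1/prime2 constants and math/bits) of the round helper that the generated code expands inline rather than calling; genRound, added further down in this file, writes exactly this body into the slide:

// Sketch only: the helper being manually inlined.
func round(acc, input uint64) uint64 {
	acc += input * prime2
	acc = bits.RotateLeft64(acc, 31)
	acc *= prime1
	return acc
}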
@@ -92,32 +96,28 @@ func slide(b []byte) uint64 {
fmt.Fprintf(w, `sz_%d:
{
b := b_%d[:]
load := uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 // Work around for go.dev/issue/68081.
b = b[8:]
v1 = round(v1, load)
load = uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 // Work around for go.dev/issue/68081.
b = b[8:]
v2 = round(v2, load)
load = uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 // Work around for go.dev/issue/68081.
var load uint64
`, i, i)
for r := 1; r <= 4; r++ {
fmt.Fprintf(w, ` load = %s
b = b[8:]
v3 = round(v3, load)
load = uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 // Work around for go.dev/issue/68081.
v4 = round(v4, load)
b_%d = (*[%d]byte)(b_%d[32:])
`, load64)
genRound(w, 2, fmt.Sprintf("v%d", r), "load")
}
fmt.Fprintf(w, `b_%d = (*[%d]byte)(b_%d[32:])
}
`, i, i, i-32, i-32, i)
`, i-32, i-32, i)
// POTENTIAL OPTIMIZATION: b[32:] creates an addition to bump the pointer, which means the address dependency of the memory loads is not resolved before the jmp table. I know two fixes:
// - change b to a pointer to the end of the slice and subtract the total offset. I don't know how to do this in pure Go.
// - don't bother reusing the slides; each load instruction can then hardcode its offset. This makes the code significantly bigger and i-cache worse, although I didn't try it.
}
w.WriteString(` h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
h = mergeRound(h, v1)
h = mergeRound(h, v2)
h = mergeRound(h, v3)
h = mergeRound(h, v4)
w.WriteString(` h = bits.RotateLeft64(v1, 1) + bits.RotateLeft64(v2, 7) + bits.RotateLeft64(v3, 12) + bits.RotateLeft64(v4, 18)
`)
for r := 1; r <= 4; r++ {
genMergeRound(w, 1, "h", fmt.Sprintf("v%d", r))
}
w.WriteString("\n")
if i != 0 { // Avoid « label sz_0 defined and not used », case 0 shortcuts with a precomputed value.
fmt.Fprintf(w, "sz_%d:\n", i)
}
@@ -134,13 +134,16 @@ func slide(b []byte) uint64 {
fmt.Fprintf(w, `sz_%dl:
{
b := b_%d[:]
load := uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 // Work around for go.dev/issue/68081.
h ^= round(0, load)
h = rol27(h)*prime1 + prime4
load := %s
var temp uint64
`, i, i, load64)
genRound(w, 2, "temp", "load")
fmt.Fprintf(w, `h ^= temp
h = bits.RotateLeft64(h, 27)*prime1 + prime4
b_%d = (*[%d]byte)(b_%d[8:])
}
`, i, i, i-8, i-8, i)
`, i-8, i-8, i)
}
fmt.Fprintf(w, `goto sz_%dl
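Each emitted sz_Nl block boils down to the standard xxHash64 8-byte tail step. A sketch of that computation as an ordinary function (assuming encoding/binary, the round helper sketched above, and the package's primes); the generated code instead spells out the load manually and advances a fixed-size array pointer:

// Sketch only: what one generated 8-byte tail block computes.
func tailStep8(h uint64, b []byte) uint64 {
	load := binary.LittleEndian.Uint64(b)
	h ^= round(0, load)
	return bits.RotateLeft64(h, 27)*prime1 + prime4
}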
@@ -155,7 +158,7 @@ func slide(b []byte) uint64 {
b := b_%d[:]
load := uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 // Work around for go.dev/issue/68081.
h ^= uint64(load) * prime1
h = rol23(h)*prime2 + prime3
h = bits.RotateLeft64(h, 23)*prime2 + prime3
b_%d = (*[%d]byte)(b_%d[4:])
goto sz_%dl
}
@@ -168,7 +171,7 @@ func slide(b []byte) uint64 {
i--
fmt.Fprintf(w, `sz_%dl:
h ^= uint64(b_%d[0]) * prime5
h = rol11(h) * prime1
h = bits.RotateLeft64(h, 11) * prime1
b_%d = (*[%d]byte)(b_%d[1:])
`, i, i, i-1, i-1, i)
@@ -207,3 +210,27 @@ sz_0l:

return nil
}

func genRound(w *bytes.Buffer, tab uint, acc, input string) {
tabs := strings.Repeat("\t", int(tab))
fmt.Fprintf(w, `%s%s += %s * prime2
%s%s = bits.RotateLeft64(%s, 31)
%s%s *= prime1
`, tabs, acc, input, tabs, acc, acc, tabs, acc)
}
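As a quick illustration of what this helper produces, a call such as the following appends the xxHash64 round, written out inline with the requested indentation:

var w bytes.Buffer
genRound(&w, 2, "v1", "load")
// w now holds (indented with two tabs):
//	v1 += load * prime2
//	v1 = bits.RotateLeft64(v1, 31)
//	v1 *= prime1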

func genMergeRound(w *bytes.Buffer, tab uint, acc, val string) {
tabs := strings.Repeat("\t", int(tab))
tab++
tabsp := strings.Repeat("\t", int(tab))
fmt.Fprintf(w, `%s{
%svar temp uint64
`, tabs, tabsp)
genRound(w, tab, "temp", val)
fmt.Fprintf(w, `%s%s ^= temp
%s%s = %s*prime1 + prime4
%s}
`, tabsp, acc, tabsp, acc, acc, tabs)
}
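Likewise, genMergeRound(&w, 1, "h", "v1") appends the following block, which inlines mergeRound by folding a fresh round of v1 into h:

	{
		var temp uint64
		temp += v1 * prime2
		temp = bits.RotateLeft64(temp, 31)
		temp *= prime1
		h ^= temp
		h = h*prime1 + prime4
	}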

const load64 = "uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 // Work around for go.dev/issue/68081."
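The load64 expression is just an unrolled little-endian 64-bit load. A minimal standalone sketch (not part of the generator) showing it matches binary.LittleEndian.Uint64, the call one would normally write before working around the compiler issue referenced above:

package main

import (
	"encoding/binary"
	"fmt"
)

// manualLoad64 mirrors the expression the generator emits via load64.
func manualLoad64(b []byte) uint64 {
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}

func main() {
	b := []byte{1, 2, 3, 4, 5, 6, 7, 8}
	fmt.Println(manualLoad64(b) == binary.LittleEndian.Uint64(b)) // prints true
}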