cespare · Jorropo · May 8, 2024 · May 8, 2024 · May 8, 2024 · Jun 18, 2024
diff --git a/go.mod b/go.mod
@@ -1,3 +1,8 @@
 module github.com/cespare/xxhash/v2
 
 go 1.11
+
+require (
+	github.com/klauspost/cpuid/v2 v2.2.8
+	golang.org/x/sys v0.15.0 // indirect
+)
diff --git a/go.sum b/go.sum
@@ -0,0 +1,5 @@
+github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS5BM=
+github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc=
+golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
diff --git a/xxhash.go b/xxhash.go
@@ -27,13 +27,10 @@ var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5}
 // Note that a zero-valued Digest is not ready to receive writes.
 // Call Reset or create a Digest using New before calling other methods.
 type Digest struct {
-	v1    uint64
-	v2    uint64
-	v3    uint64
-	v4    uint64
+	s     [4]uint64 // accumulators v1-v4; the assembly loads them from offsets 0-24 of Digest
 	total uint64
 	mem   [32]byte
-	n     int // how much of mem is used
+	n     uint8 // how much of mem is used
 }
 
 // New creates a new Digest with a zero seed.
@@ -57,10 +54,10 @@ func (d *Digest) Reset() {
 // ResetWithSeed clears the Digest's state so that it can be reused.
 // It uses the given seed to initialize the state.
 func (d *Digest) ResetWithSeed(seed uint64) {
-	d.v1 = seed + prime1 + prime2
-	d.v2 = seed + prime2
-	d.v3 = seed
-	d.v4 = seed - prime1
+	d.s[0] = seed + prime1 + prime2
+	d.s[1] = seed + prime2
+	d.s[2] = seed
+	d.s[3] = seed - prime1
 	d.total = 0
 	d.n = 0
 }
@@ -76,35 +73,32 @@ func (d *Digest) Write(b []byte) (n int, err error) {
 	n = len(b)
 	d.total += uint64(n)
 
-	memleft := d.mem[d.n&(len(d.mem)-1):]
-
-	if d.n+n < 32 {
-		// This new data doesn't even fill the current block.
-		copy(memleft, b)
-		d.n += n
-		return
-	}
-
-	if d.n > 0 {
-		// Finish off the partial block.
-		c := copy(memleft, b)
-		d.v1 = round(d.v1, u64(d.mem[0:8]))
-		d.v2 = round(d.v2, u64(d.mem[8:16]))
-		d.v3 = round(d.v3, u64(d.mem[16:24]))
-		d.v4 = round(d.v4, u64(d.mem[24:32]))
-		b = b[c:]
+	var extra *[32]byte
+	if d.n != 0 {
+		// there is data already in mem, append to it.
+		added := copy(d.mem[d.n:], b)
+		b = b[added:]
+		d.n += uint8(added)
+		if uint(d.n) < uint(len(d.mem)) {
+			// not enough data to hash.
+			return
+		}
+		extra = &d.mem
 		d.n = 0
 	}
 
 	if len(b) >= 32 {
 		// One or more full blocks left.
-		nw := writeBlocks(d, b)
-		b = b[nw:]
+		writeBlocks(d, extra, b)
+		b = b[uint(len(b))&^31:]
+	} else if extra != nil {
+		// b holds less than a full block, but the buffered block (extra) is complete.
+		// writeBlocks must never be called with len(b) < 32, so pass extra as b.
+		writeBlocks(d, nil, extra[:])
 	}
 
 	// Store any remaining partial block.
-	copy(d.mem[:], b)
-	d.n = len(b)
+	d.n = uint8(copy(d.mem[:], b))
 
 	return
 }
@@ -130,19 +124,19 @@ func (d *Digest) Sum64() uint64 {
 	var h uint64
 
 	if d.total >= 32 {
-		v1, v2, v3, v4 := d.v1, d.v2, d.v3, d.v4
+		v1, v2, v3, v4 := d.s[0], d.s[1], d.s[2], d.s[3]
 		h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
 		h = mergeRound(h, v1)
 		h = mergeRound(h, v2)
 		h = mergeRound(h, v3)
 		h = mergeRound(h, v4)
 	} else {
-		h = d.v3 + prime5
+		h = d.s[2] + prime5
 	}
 
 	h += d.total
 
-	b := d.mem[:d.n&(len(d.mem)-1)]
+	b := d.mem[:d.n&uint8(len(d.mem)-1)]
 	for ; len(b) >= 8; b = b[8:] {
 		k1 := round(0, u64(b[:8]))
 		h ^= k1
@@ -176,13 +170,13 @@ const (
 func (d *Digest) MarshalBinary() ([]byte, error) {
 	b := make([]byte, 0, marshaledSize)
 	b = append(b, magic...)
-	b = appendUint64(b, d.v1)
-	b = appendUint64(b, d.v2)
-	b = appendUint64(b, d.v3)
-	b = appendUint64(b, d.v4)
+	b = appendUint64(b, d.s[0])
+	b = appendUint64(b, d.s[1])
+	b = appendUint64(b, d.s[2])
+	b = appendUint64(b, d.s[3])
 	b = appendUint64(b, d.total)
 	b = append(b, d.mem[:d.n]...)
-	b = b[:len(b)+len(d.mem)-d.n]
+	b = b[:len(b)+len(d.mem)-int(d.n)]
 	return b, nil
 }
 
@@ -195,13 +189,13 @@ func (d *Digest) UnmarshalBinary(b []byte) error {
 		return errors.New("xxhash: invalid hash state size")
 	}
 	b = b[len(magic):]
-	b, d.v1 = consumeUint64(b)
-	b, d.v2 = consumeUint64(b)
-	b, d.v3 = consumeUint64(b)
-	b, d.v4 = consumeUint64(b)
+	b, d.s[0] = consumeUint64(b)
+	b, d.s[1] = consumeUint64(b)
+	b, d.s[2] = consumeUint64(b)
+	b, d.s[3] = consumeUint64(b)
 	b, d.total = consumeUint64(b)
 	copy(d.mem[:], b)
-	d.n = int(d.total % uint64(len(d.mem)))
+	d.n = uint8(d.total % uint64(len(d.mem)))
 	return nil
 }
 

diff --git a/xxhash_amd64.s b/xxhash_amd64.s
@@ -40,25 +40,29 @@
 	IMULQ prime1, acc \
 	ADDQ  prime4, acc
 
-// blockLoop processes as many 32-byte blocks as possible,
-// updating v1, v2, v3, and v4. It assumes that there is at least one block
-// to process.
-#define blockLoop() \
-loop:  \
+// round32 performs one 32-byte round, loading from p and updating v1, v2, v3, and v4.
+#define round32() \
 	MOVQ +0(p), x  \
 	round(v1, x)   \
 	MOVQ +8(p), x  \
 	round(v2, x)   \
 	MOVQ +16(p), x \
 	round(v3, x)   \
 	MOVQ +24(p), x \
-	round(v4, x)   \
-	ADDQ $32, p    \
-	CMPQ p, end    \
+	round(v4, x)
+
+// blockLoop processes as many 32-byte blocks as possible,
+// updating v1, v2, v3, and v4. It assumes that there is at least one block
+// to process.
+#define blockLoop() \
+loop:  \
+	round32()   \
+	ADDQ $32, p \
+	CMPQ p, end \
 	JLE  loop
 
-// func Sum64(b []byte) uint64
-TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
+// func sum64Scalar(b []byte) uint64
+TEXT ·sum64Scalar(SB), NOSPLIT|NOFRAME, $0-32
 	// Load fixed primes.
 	MOVQ ·primes+0(SB), prime1
 	MOVQ ·primes+8(SB), prime2
@@ -173,25 +177,32 @@ finalize:
 	MOVQ h, ret+24(FP)
 	RET
 
-// func writeBlocks(d *Digest, b []byte) int
-TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
+// func writeBlocksScalar(d *Digest, extra *[32]byte, b []byte)
+TEXT ·writeBlocksScalar(SB), NOSPLIT|NOFRAME, $0-40
 	// Load fixed primes needed for round.
 	MOVQ ·primes+0(SB), prime1
 	MOVQ ·primes+8(SB), prime2
 
-	// Load slice.
-	MOVQ b_base+8(FP), p
-	MOVQ b_len+16(FP), n
-	LEAQ (p)(n*1), end
-	SUBQ $32, end
-
 	// Load vN from d.
 	MOVQ s+0(FP), d
 	MOVQ 0(d), v1
 	MOVQ 8(d), v2
 	MOVQ 16(d), v3
 	MOVQ 24(d), v4
 
+	// Handle extra
+	MOVQ extra+8(FP), p
+	TESTQ p, p
+	JZ	noExtra
+		round32()
+noExtra:
+
+	// Load slice.
+	MOVQ b_base+16(FP), p
+	MOVQ b_len+24(FP), n
+	LEAQ (p)(n*1), end
+	SUBQ $32, end
+
 	// We don't need to check the loop condition here; this function is
 	// always called with at least one block of data to process.
 	blockLoop()
@@ -202,8 +213,4 @@ TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
 	MOVQ v3, 16(d)
 	MOVQ v4, 24(d)
 
-	// The number of bytes written is p minus the old base pointer.
-	SUBQ b_base+8(FP), p
-	MOVQ p, ret+32(FP)
-
 	RET
diff --git a/xxhash_arm64.s b/xxhash_arm64.s
@@ -161,8 +161,8 @@ finalize:
 	MOVD h, ret+24(FP)
 	RET
 
-// func writeBlocks(d *Digest, b []byte) int
-TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
+// func writeBlocksArm64(d *Digest, b []byte)
+TEXT ·writeBlocksArm64(SB), NOSPLIT|NOFRAME, $0-32
 	LDP ·primes+0(SB), (prime1, prime2)
 
 	// Load state. Assume v[1-4] are stored contiguously.
@@ -178,6 +178,4 @@ TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
 	STP (v1, v2), 0(digest)
 	STP (v3, v4), 16(digest)
 
-	BIC  $31, n
-	MOVD n, ret+32(FP)
 	RET
diff --git a/xxhash_asm.go b/xxhash_asm.go
diff --git a/xxhash_asm_amd64.go b/xxhash_asm_amd64.go
@@ -0,0 +1,31 @@
+//go:build amd64 && !appengine && gc && !purego && go1.22
+// +build amd64,!appengine,gc,!purego,go1.22
+
+package xxhash
+
+import "github.com/klauspost/cpuid/v2"
+
+var useAvx512 = cpuid.CPU.Supports(
+	cpuid.AVX,
+	cpuid.AVX2,
+	cpuid.AVX512DQ,
+	cpuid.AVX512F,
+	cpuid.AVX512VL,
+	cpuid.BMI1,
+
+// Today, vectorized 64-bit integer multiplies are very slow on Intel;
+// with ILP a single scalar unit is multiple times faster.
+// This means we sometimes won't use AVX512 under virtualization, because
+// the reported vendor will be the hypervisor, but in my experience that is rare.
+// Most virtualization setups default to reporting the vendor ID of the host.
+) && cpuid.CPU.IsVendor(cpuid.AMD)
+
+// Sum64 computes the 64-bit xxHash digest of b with a zero seed.
+//
+//go:noescape
+func Sum64(b []byte) uint64
+
+// writeBlocks processes extra, if non-nil, as a first block before b.
+//
+//go:noescape
+func writeBlocks(d *Digest, extra *[32]byte, b []byte)
diff --git a/xxhash_asm_amd64_old.go b/xxhash_asm_amd64_old.go
@@ -0,0 +1,22 @@
+//go:build amd64 && !appengine && gc && !purego && !go1.22
+// +build amd64,!appengine,gc,!purego,!go1.22
+
+// The avx512 impl relies on PCALIGN.
+
+package xxhash
+
+// Sum64 computes the 64-bit xxHash digest of b with a zero seed.
+func Sum64(b []byte) uint64 {
+	return sum64Scalar(b)
+}
+
+//go:noescape
+func sum64Scalar(b []byte) uint64
+
+// writeBlocks processes extra, if non-nil, as a first block before b, using the scalar assembly.
+func writeBlocks(d *Digest, extra *[32]byte, b []byte) {
+	writeBlocksScalar(d, extra, b)
+}
+
+//go:noescape
+func writeBlocksScalar(d *Digest, extra *[32]byte, b []byte)
diff --git a/xxhash_asm_arm64.go b/xxhash_asm_arm64.go
@@ -0,0 +1,26 @@
+//go:build arm64 && !appengine && gc && !purego
+// +build arm64,!appengine,gc,!purego
+
+package xxhash
+
+var useAvx512 = false // used in tests
+
+// Sum64 computes the 64-bit xxHash digest of b with a zero seed.
+//
+//go:noescape
+func Sum64(b []byte) uint64
+
+// writeBlocks processes extra, if non-nil, as a first block before b.
+func writeBlocks(d *Digest, extra *[32]byte, b []byte) {
+	if extra != nil {
+		// FIXME: handle this logic in assembly; the author had not yet
+		// learned the arm64 Plan 9 assembler syntax.
+		// At least this is hopefully on par with the speed of the Go code
+		// that used to handle the buffered block.
+		writeBlocksArm64(d, extra[:])
+	}
+	writeBlocksArm64(d, b)
+}
+
+//go:noescape
+func writeBlocksArm64(d *Digest, b []byte)