Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add avx512 implementation #79

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
module github.com/cespare/xxhash/v2

go 1.11

require (
github.com/klauspost/cpuid/v2 v2.2.8
golang.org/x/sys v0.15.0 // indirect
)
5 changes: 5 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS5BM=
github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc=
golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
80 changes: 37 additions & 43 deletions xxhash.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,10 @@ var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5}
// Note that a zero-valued Digest is not ready to receive writes.
// Call Reset or create a Digest using New before calling other methods.
type Digest struct {
v1 uint64
v2 uint64
v3 uint64
v4 uint64
s [4]uint64
total uint64
mem [32]byte
n int // how much of mem is used
n uint8 // how much of mem is used
}

// New creates a new Digest with a zero seed.
Expand All @@ -57,10 +54,10 @@ func (d *Digest) Reset() {
// ResetWithSeed clears the Digest's state so that it can be reused.
// It uses the given seed to initialize the state.
func (d *Digest) ResetWithSeed(seed uint64) {
d.v1 = seed + prime1 + prime2
d.v2 = seed + prime2
d.v3 = seed
d.v4 = seed - prime1
d.s[0] = seed + prime1 + prime2
d.s[1] = seed + prime2
d.s[2] = seed
d.s[3] = seed - prime1
d.total = 0
d.n = 0
}
Expand All @@ -76,35 +73,32 @@ func (d *Digest) Write(b []byte) (n int, err error) {
n = len(b)
d.total += uint64(n)

memleft := d.mem[d.n&(len(d.mem)-1):]

if d.n+n < 32 {
// This new data doesn't even fill the current block.
copy(memleft, b)
d.n += n
return
}

if d.n > 0 {
// Finish off the partial block.
c := copy(memleft, b)
d.v1 = round(d.v1, u64(d.mem[0:8]))
d.v2 = round(d.v2, u64(d.mem[8:16]))
d.v3 = round(d.v3, u64(d.mem[16:24]))
d.v4 = round(d.v4, u64(d.mem[24:32]))
b = b[c:]
var extra *[32]byte
if d.n != 0 {
// there is data already in mem, append to it.
added := copy(d.mem[d.n:], b)
b = b[added:]
d.n += uint8(added)
if uint(d.n) < uint(len(d.mem)) {
// not enough data to hash.
return
}
extra = &d.mem
d.n = 0
}

if len(b) >= 32 {
// One or more full blocks left.
nw := writeBlocks(d, b)
b = b[nw:]
writeBlocks(d, extra, b)
b = b[uint(len(b))&^31:]
} else if extra != nil {
// Fewer than 32 bytes of b remain, but a full buffered block (extra) is pending.
// writeBlocks must never be called with len(b) < 32, so pass extra as b.
writeBlocks(d, nil, extra[:])
}

// Store any remaining partial block.
copy(d.mem[:], b)
d.n = len(b)
d.n = uint8(copy(d.mem[:], b))

return
}
Expand All @@ -130,19 +124,19 @@ func (d *Digest) Sum64() uint64 {
var h uint64

if d.total >= 32 {
v1, v2, v3, v4 := d.v1, d.v2, d.v3, d.v4
v1, v2, v3, v4 := d.s[0], d.s[1], d.s[2], d.s[3]
h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
h = mergeRound(h, v1)
h = mergeRound(h, v2)
h = mergeRound(h, v3)
h = mergeRound(h, v4)
} else {
h = d.v3 + prime5
h = d.s[2] + prime5
}

h += d.total

b := d.mem[:d.n&(len(d.mem)-1)]
b := d.mem[:d.n&uint8(len(d.mem)-1)]
for ; len(b) >= 8; b = b[8:] {
k1 := round(0, u64(b[:8]))
h ^= k1
Expand Down Expand Up @@ -176,13 +170,13 @@ const (
func (d *Digest) MarshalBinary() ([]byte, error) {
b := make([]byte, 0, marshaledSize)
b = append(b, magic...)
b = appendUint64(b, d.v1)
b = appendUint64(b, d.v2)
b = appendUint64(b, d.v3)
b = appendUint64(b, d.v4)
b = appendUint64(b, d.s[0])
b = appendUint64(b, d.s[1])
b = appendUint64(b, d.s[2])
b = appendUint64(b, d.s[3])
b = appendUint64(b, d.total)
b = append(b, d.mem[:d.n]...)
b = b[:len(b)+len(d.mem)-d.n]
b = b[:len(b)+len(d.mem)-int(d.n)]
return b, nil
}

Expand All @@ -195,13 +189,13 @@ func (d *Digest) UnmarshalBinary(b []byte) error {
return errors.New("xxhash: invalid hash state size")
}
b = b[len(magic):]
b, d.v1 = consumeUint64(b)
b, d.v2 = consumeUint64(b)
b, d.v3 = consumeUint64(b)
b, d.v4 = consumeUint64(b)
b, d.s[0] = consumeUint64(b)
b, d.s[1] = consumeUint64(b)
b, d.s[2] = consumeUint64(b)
b, d.s[3] = consumeUint64(b)
b, d.total = consumeUint64(b)
copy(d.mem[:], b)
d.n = int(d.total % uint64(len(d.mem)))
d.n = uint8(d.total % uint64(len(d.mem)))
return nil
}

Expand Down
51 changes: 29 additions & 22 deletions xxhash_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -40,25 +40,29 @@
IMULQ prime1, acc \
ADDQ prime4, acc

// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that there is at least one block
// to process.
#define blockLoop() \
loop: \
// round32 performs one 32-byte round on v1, v2, v3, and v4, loading the input from p.
#define round32() \
MOVQ +0(p), x \
round(v1, x) \
MOVQ +8(p), x \
round(v2, x) \
MOVQ +16(p), x \
round(v3, x) \
MOVQ +24(p), x \
round(v4, x) \
ADDQ $32, p \
CMPQ p, end \
round(v4, x)

// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that there is at least one block
// to process.
#define blockLoop() \
loop: \
round32() \
ADDQ $32, p \
CMPQ p, end \
JLE loop

// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
// func sum64Scalar(b []byte) uint64
TEXT ·sum64Scalar(SB), NOSPLIT|NOFRAME, $0-32
// Load fixed primes.
MOVQ ·primes+0(SB), prime1
MOVQ ·primes+8(SB), prime2
Expand Down Expand Up @@ -173,25 +177,32 @@ finalize:
MOVQ h, ret+24(FP)
RET

// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
// func writeBlocksScalar(d *Digest, extra *[32]byte, b []byte)
TEXT ·writeBlocksScalar(SB), NOSPLIT|NOFRAME, $0-40
// Load fixed primes needed for round.
MOVQ ·primes+0(SB), prime1
MOVQ ·primes+8(SB), prime2

// Load slice.
MOVQ b_base+8(FP), p
MOVQ b_len+16(FP), n
LEAQ (p)(n*1), end
SUBQ $32, end

// Load vN from d.
MOVQ s+0(FP), d
MOVQ 0(d), v1
MOVQ 8(d), v2
MOVQ 16(d), v3
MOVQ 24(d), v4

// Handle extra
MOVQ extra+8(FP), p
TESTQ p, p
JZ noExtra
round32()
noExtra:

// Load slice.
MOVQ b_base+16(FP), p
MOVQ b_len+24(FP), n
LEAQ (p)(n*1), end
SUBQ $32, end

// We don't need to check the loop condition here; this function is
// always called with at least one block of data to process.
blockLoop()
Expand All @@ -202,8 +213,4 @@ TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
MOVQ v3, 16(d)
MOVQ v4, 24(d)

// The number of bytes written is p minus the old base pointer.
SUBQ b_base+8(FP), p
MOVQ p, ret+32(FP)

RET
6 changes: 2 additions & 4 deletions xxhash_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,8 @@ finalize:
MOVD h, ret+24(FP)
RET

// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
// func writeBlocksArm64(d *Digest, b []byte)
TEXT ·writeBlocksArm64(SB), NOSPLIT|NOFRAME, $0-32
LDP ·primes+0(SB), (prime1, prime2)

// Load state. Assume v[1-4] are stored contiguously.
Expand All @@ -178,6 +178,4 @@ TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
STP (v1, v2), 0(digest)
STP (v3, v4), 16(digest)

BIC $31, n
MOVD n, ret+32(FP)
RET
15 changes: 0 additions & 15 deletions xxhash_asm.go

This file was deleted.

31 changes: 31 additions & 0 deletions xxhash_asm_amd64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
//go:build amd64 && !appengine && gc && !purego && go1.22
// +build amd64,!appengine,gc,!purego,go1.22

package xxhash

import "github.com/klauspost/cpuid/v2"

// useAvx512 is true only when the CPU reports every feature listed below and
// identifies as an AMD part.
var useAvx512 = cpuid.CPU.Supports(
cpuid.AVX,
cpuid.AVX2,
cpuid.AVX512DQ,
cpuid.AVX512F,
cpuid.AVX512VL,
cpuid.BMI1,

// The vendor check below restricts AVX-512 to AMD. Per the original author,
// vectorized 64-bit integer multiplies are currently slow on Intel, where
// the scalar code is several times faster thanks to instruction-level
// parallelism.
// As a consequence, AVX-512 may go unused under virtualization when the
// reported vendor is the hypervisor rather than the host CPU. The author
// considers this rare, since most virtualization setups default to reporting
// the host's vendor ID.
) && cpuid.CPU.IsVendor(cpuid.AMD)

// Sum64 computes the 64-bit xxHash digest of b with a zero seed.
//
//go:noescape
func Sum64(b []byte) uint64

// writeBlocks is declared without a Go body in this file.
// extra, when non-nil, is a first 32-byte block to process before b;
// when it is nil it is skipped.
//
//go:noescape
func writeBlocks(d *Digest, extra *[32]byte, b []byte)
22 changes: 22 additions & 0 deletions xxhash_asm_amd64_old.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//go:build amd64 && !appengine && gc && !purego && !go1.22
// +build amd64,!appengine,gc,!purego,!go1.22

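// The AVX-512 implementation relies on the PCALIGN assembler directive, so this
// file provides scalar-only entry points for toolchains older than Go 1.22.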

package xxhash

// Sum64 computes the 64-bit xxHash digest of b with a zero seed.
// On this build it simply forwards to the scalar routine sum64Scalar.
func Sum64(b []byte) uint64 {
return sum64Scalar(b)
}

// sum64Scalar is the scalar implementation backing Sum64. It is declared
// without a body here; the TEXT ·sum64Scalar symbol lives in xxhash_amd64.s.
//
//go:noescape
func sum64Scalar(b []byte) uint64

// writeBlocks feeds 32-byte blocks into d by forwarding to the scalar
// implementation, writeBlocksScalar. extra, when non-nil, is a first block
// processed before b; a nil extra is skipped.
func writeBlocks(d *Digest, extra *[32]byte, b []byte) {
	// writeBlocksScalar has no result, so it must be invoked as a plain
	// statement: "return f()" does not compile in a function without results.
	writeBlocksScalar(d, extra, b)
}

// writeBlocksScalar is the scalar implementation behind writeBlocks; it is
// declared without a body here and defined as TEXT ·writeBlocksScalar in
// xxhash_amd64.s. When extra is non-nil it is processed as one 32-byte block
// before b. b must contain at least one full 32-byte block.
//
//go:noescape
func writeBlocksScalar(d *Digest, extra *[32]byte, b []byte)
26 changes: 26 additions & 0 deletions xxhash_asm_arm64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
//go:build arm64 && !appengine && gc && !purego
// +build arm64,!appengine,gc,!purego

package xxhash

// useAvx512 is always false on arm64; this file wires up no AVX-512 path.
var useAvx512 = false // used in tests

// Sum64 computes the 64-bit xxHash digest of b with a zero seed.
//
//go:noescape
func Sum64(b []byte) uint64

// writeBlocks hashes the optional leading block extra (skipped when nil) and
// then b into d.
func writeBlocks(d *Digest, extra *[32]byte, b []byte) {
	if extra == nil {
		// No leading block: a single pass over b is enough.
		writeBlocksArm64(d, b)
		return
	}

	// TODO: handle the extra block inside the arm64 assembly instead of
	// issuing a second call. The author left it in Go and hopes this is on
	// par with the speed of the previous software implementation.
	writeBlocksArm64(d, extra[:])
	writeBlocksArm64(d, b)
}

// writeBlocksArm64 is declared without a body; the matching
// TEXT ·writeBlocksArm64 symbol is in xxhash_arm64.s. Unlike writeBlocks it
// takes no extra block and returns nothing.
//
//go:noescape
func writeBlocksArm64(d *Digest, b []byte)
Loading