From 8dc885906ee5dce97bfbcea705de28900f1c732b Mon Sep 17 00:00:00 2001 From: Jakub Sztandera Date: Sun, 6 Oct 2019 12:28:43 +0200 Subject: [PATCH 1/8] Implement buzhash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It has the same properties as Rabin. Benchmark results: ``` name time/op Buzhash-4 14.3ms ± 7% Rabin-4 94.1ms ± 3% Default-4 1.74ms ± 7% name speed Buzhash-4 1.18GB/s ± 7% Rabin-4 178MB/s ± 3% Default-4 9.63GB/s ± 6% name alloc/op Buzhash-4 14.0kB ±48% Rabin-4 19.2MB ± 0% Default-4 474B ± 6% name allocs/op Buzhash-4 1.00 ± 0% Rabin-4 196 ±12% Default-4 2.00 ± 0% ``` License: MIT Signed-off-by: Jakub Sztandera --- buzhash.go | 115 ++++++++++++++++++++++++++++++++++++++++++++++++ buzhash_test.go | 94 +++++++++++++++++++++++++++++++++++++++ gen/main.go | 33 ++++++++++++++ rabin_test.go | 25 ++++++++--- 4 files changed, 261 insertions(+), 6 deletions(-) create mode 100644 buzhash.go create mode 100644 buzhash_test.go create mode 100644 gen/main.go diff --git a/buzhash.go b/buzhash.go new file mode 100644 index 0000000..dbec13c --- /dev/null +++ b/buzhash.go @@ -0,0 +1,115 @@ +package chunk + +import ( + "io" + "math/bits" + + pool "github.com/libp2p/go-buffer-pool" +) + +const ( + buzMin = 128 << 10 + buzMax = 512 << 10 + buzMask = 1<<17 - 1 +) + +type Buzhash struct { + r io.Reader + buf []byte + n int + + err error +} + +func NewBuzhash(r io.Reader) *Buzhash { + return &Buzhash{ + r: r, + buf: pool.Get(buzMax), + } +} + +func (b *Buzhash) NextBytes() ([]byte, error) { + if b.err != nil { + return nil, b.err + } + + buf := b.buf + n, err := io.ReadFull(b.r, buf[b.n:]) + if err != nil { + if err == io.ErrUnexpectedEOF { + b.err = io.EOF + return buf[:n+b.n], nil + } else { + b.err = err + pool.Put(buf) + return nil, err + } + } + + i := buzMin - 32 + + var state uint32 = 0 + + for ; i < buzMin; i++ { + state = bits.RotateLeft32(state, 1) + state = state ^ bytehash[buf[i]] + } + + for ; state&buzMask != 0; i++ { + 
if i >= buzMax { + break + } + state = bits.RotateLeft32(state, 1) ^ bytehash[buf[i-32]] ^ bytehash[buf[i]] + } + + res := buf[:i] + b.buf = pool.Get(buzMax) + b.n = copy(b.buf, buf[i:]) + + return res, nil +} + +var bytehash = [256]uint32{ + 0x6236e7d5, 0x10279b0b, 0x72818182, 0xdc526514, 0x2fd41e3d, 0x777ef8c8, + 0x83ee5285, 0x2c8f3637, 0x2f049c1a, 0x57df9791, 0x9207151f, 0x9b544818, + 0x74eef658, 0x2028ca60, 0x0271d91a, 0x27ae587e, 0xecf9fa5f, 0x236e71cd, + 0xf43a8a2e, 0xbb13380, 0x9e57912c, 0x89a26cdb, 0x9fcf3d71, 0xa86da6f1, + 0x9c49f376, 0x346aecc7, 0xf094a9ee, 0xea99e9cb, 0xb01713c6, 0x88acffb, + 0x2960a0fb, 0x344a626c, 0x7ff22a46, 0x6d7a1aa5, 0x6a714916, 0x41d454ca, + 0x8325b830, 0xb65f563, 0x447fecca, 0xf9d0ea5e, 0xc1d9d3d4, 0xcb5ec574, + 0x55aae902, 0x86edc0e7, 0xd3a9e33, 0xe70dc1e1, 0xe3c5f639, 0x9b43140a, + 0xc6490ac5, 0x5e4030fb, 0x8e976dd5, 0xa87468ea, 0xf830ef6f, 0xcc1ed5a5, + 0x611f4e78, 0xddd11905, 0xf2613904, 0x566c67b9, 0x905a5ccc, 0x7b37b3a4, + 0x4b53898a, 0x6b8fd29d, 0xaad81575, 0x511be414, 0x3cfac1e7, 0x8029a179, + 0xd40efeda, 0x7380e02, 0xdc9beffd, 0x2d049082, 0x99bc7831, 0xff5002a8, + 0x21ce7646, 0x1cd049b, 0xf43994f, 0xc3c6c5a5, 0xbbda5f50, 0xec15ec7, + 0x9adb19b6, 0xc1e80b9, 0xb9b52968, 0xae162419, 0x2542b405, 0x91a42e9d, + 0x6be0f668, 0x6ed7a6b9, 0xbc2777b4, 0xe162ce56, 0x4266aad5, 0x60fdb704, + 0x66f832a5, 0x9595f6ca, 0xfee83ced, 0x55228d99, 0x12bf0e28, 0x66896459, + 0x789afda, 0x282baa8, 0x2367a343, 0x591491b0, 0x2ff1a4b1, 0x410739b6, + 0x9b7055a0, 0x2e0eb229, 0x24fc8252, 0x3327d3df, 0xb0782669, 0x1c62e069, + 0x7f503101, 0xf50593ae, 0xd9eb275d, 0xe00eb678, 0x5917ccde, 0x97b9660a, + 0xdd06202d, 0xed229e22, 0xa9c735bf, 0xd6316fe6, 0x6fc72e4c, 0x206dfa2, + 0xd6b15c5a, 0x69d87b49, 0x9c97745, 0x13445d61, 0x35a975aa, 0x859aa9b9, + 0x65380013, 0xd1fb6391, 0xc29255fd, 0x784a3b91, 0xb9e74c26, 0x63ce4d40, + 0xc07cbe9e, 0xe6e4529e, 0xfb3632f, 0x9438d9c9, 0x682f94a8, 0xf8fd4611, + 0x257ec1ed, 0x475ce3d6, 0x60ee2db1, 0x2afab002, 0x2b9e4878, 
0x86b340de, + 0x1482fdca, 0xfe41b3bf, 0xd4a412b0, 0xe09db98c, 0xc1af5d53, 0x7e55e25f, + 0xd3346b38, 0xb7a12cbd, 0x9c6827ba, 0x71f78bee, 0x8c3a0f52, 0x150491b0, + 0xf26de912, 0x233e3a4e, 0xd309ebba, 0xa0a9e0ff, 0xca2b5921, 0xeeb9893c, + 0x33829e88, 0x9870cc2a, 0x23c4b9d0, 0xeba32ea3, 0xbdac4d22, 0x3bc8c44c, + 0x1e8d0397, 0xf9327735, 0x783b009f, 0xeb83742, 0x2621dc71, 0xed017d03, + 0x5c760aa1, 0x5a69814b, 0x96e3047f, 0xa93c9cde, 0x615c86f5, 0xb4322aa5, + 0x4225534d, 0xd2e2de3, 0xccfccc4b, 0xbac2a57, 0xf0a06d04, 0xbc78d737, + 0xf2d1f766, 0xf5a7953c, 0xbcdfda85, 0x5213b7d5, 0xbce8a328, 0xd38f5f18, + 0xdb094244, 0xfe571253, 0x317fa7ee, 0x4a324f43, 0x3ffc39d9, 0x51b3fa8e, + 0x7a4bee9f, 0x78bbc682, 0x9f5c0350, 0x2fe286c, 0x245ab686, 0xed6bf7d7, + 0xac4988a, 0x3fe010fa, 0xc65fe369, 0xa45749cb, 0x2b84e537, 0xde9ff363, + 0x20540f9a, 0xaa8c9b34, 0x5bc476b3, 0x1d574bd7, 0x929100ad, 0x4721de4d, + 0x27df1b05, 0x58b18546, 0xb7e76764, 0xdf904e58, 0x97af57a1, 0xbd4dc433, + 0xa6256dfd, 0xf63998f3, 0xf1e05833, 0xe20acf26, 0xf57fd9d6, 0x90300b4d, + 0x89df4290, 0x68d01cbc, 0xcf893ee3, 0xcc42a046, 0x778e181b, 0x67265c76, + 0xe981a4c4, 0x82991da1, 0x708f7294, 0xe6e2ae62, 0xfc441870, 0x95e1b0b6, + 0x445f825, 0x5a93b47f, 0x5e9cf4be, 0x84da71e7, 0x9d9582b0, 0x9bf835ef, + 0x591f61e2, 0x43325985, 0x5d2de32e, 0x8d8fbf0f, 0x95b30f38, 0x7ad5b6e, + 0x4e934edf, 0x3cd4990e, 0x9053e259, 0x5c41857d} diff --git a/buzhash_test.go b/buzhash_test.go new file mode 100644 index 0000000..15b46b7 --- /dev/null +++ b/buzhash_test.go @@ -0,0 +1,94 @@ +package chunk + +import ( + "bytes" + "fmt" + "io" + "testing" + + util "github.com/ipfs/go-ipfs-util" + pool "github.com/libp2p/go-buffer-pool" +) + +func TestBuzhashChunking(t *testing.T) { + data := make([]byte, 1024*1024*16) + util.NewTimeSeededRand().Read(data) + + r := NewBuzhash(bytes.NewReader(data)) + + var chunks [][]byte + + for { + chunk, err := r.NextBytes() + if err != nil { + if err == io.EOF { + break + } + t.Fatal(err) + } + + chunks = 
append(chunks, chunk) + } + + t.Logf("average block size: %d\n", len(data)/len(chunks)) + + unchunked := bytes.Join(chunks, nil) + if !bytes.Equal(unchunked, data) { + fmt.Printf("%d %d\n", len(unchunked), len(data)) + //ioutil.WriteFile("./incorrect", unchunked, 0777) + //ioutil.WriteFile("./correct", data, 0777) + t.Fatal("data was chunked incorrectly") + } +} + +func TestBuzhashChunkReuse(t *testing.T) { + newBuzhash := func(r io.Reader) cher { + return NewBuzhash(r) + } + testReuse(t, newBuzhash) +} + +func BenchmarkBuzhash(b *testing.B) { + data := make([]byte, 16<<20) + util.NewTimeSeededRand().Read(data) + + b.SetBytes(16 << 20) + b.ReportAllocs() + b.ResetTimer() + + var res uint64 + + for i := 0; i < b.N; i++ { + r := NewBuzhash(bytes.NewReader(data)) + + for { + chunk, err := r.NextBytes() + if err != nil { + if err == io.EOF { + break + } + b.Fatal(err) + } + res = res + uint64(len(chunk)) + pool.Put(chunk) + } + } + Res = Res + res +} + +func TestBuzhashBitsHash(t *testing.T) { + counts := make([]byte, 32) + for _, h := range bytehash { + for i := 0; i < 32; i++ { + if h&1 == 1 { + counts[i]++ + } + h = h >> 1 + } + } + for i, c := range counts { + if c != 128 { + t.Errorf("Bit balance in position %d broken, %d ones", i, c) + } + } +} diff --git a/gen/main.go b/gen/main.go new file mode 100644 index 0000000..9d90854 --- /dev/null +++ b/gen/main.go @@ -0,0 +1,33 @@ +// This file generates bytehash LUT +package main + +import ( + "fmt" + "math/rand" +) + +const nRounds = 200 + +func main() { + rnd := rand.New(rand.NewSource(0)) + + lut := make([]uint32, 256) + for i := 0; i < 256/2; i++ { + lut[i] = 1<<32 - 1 + } + + for r := 0; r < nRounds; r++ { + for b := uint32(0); b < 32; b++ { + mask := uint32(1) << b + nmask := ^mask + for i, j := range rnd.Perm(256) { + li := lut[i] + lj := lut[j] + lut[i] = li&nmask | (lj & mask) + lut[j] = lj&nmask | (li & mask) + } + } + } + + fmt.Printf("%#v", lut) +} diff --git a/rabin_test.go b/rabin_test.go index 
140c0c4..7aa8a13 100644 --- a/rabin_test.go +++ b/rabin_test.go @@ -39,8 +39,14 @@ func TestRabinChunking(t *testing.T) { } } -func chunkData(t *testing.T, data []byte) map[string]blocks.Block { - r := NewRabin(bytes.NewReader(data), 1024*256) +type cher interface { + NextBytes() ([]byte, error) +} + +type newChunker func(io.Reader) cher + +func chunkData(t *testing.T, newC newChunker, data []byte) map[string]blocks.Block { + r := newC(bytes.NewReader(data)) blkmap := make(map[string]blocks.Block) @@ -60,12 +66,12 @@ func chunkData(t *testing.T, data []byte) map[string]blocks.Block { return blkmap } -func TestRabinChunkReuse(t *testing.T) { +func testReuse(t *testing.T, cr newChunker) { data := make([]byte, 1024*1024*16) util.NewTimeSeededRand().Read(data) - ch1 := chunkData(t, data[1000:]) - ch2 := chunkData(t, data) + ch1 := chunkData(t, cr, data[1000:]) + ch2 := chunkData(t, cr, data) var extra int for k := range ch2 { @@ -76,8 +82,15 @@ func TestRabinChunkReuse(t *testing.T) { } if extra > 2 { - t.Log("too many spare chunks made") + t.Logf("too many spare chunks made: %d", extra) + } +} + +func TestRabinChunkReuse(t *testing.T) { + newRabin := func(r io.Reader) cher { + return NewRabin(r, 256*1024) } + testReuse(t, newRabin) } var Res uint64 From 29aeb2c8a218c8b1478500b8d6f60f8a14cad487 Mon Sep 17 00:00:00 2001 From: Jakub Sztandera Date: Sun, 6 Oct 2019 13:53:24 +0200 Subject: [PATCH 2/8] Combine if conditions License: MIT Signed-off-by: Jakub Sztandera --- buzhash.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/buzhash.go b/buzhash.go index dbec13c..4bbe009 100644 --- a/buzhash.go +++ b/buzhash.go @@ -55,10 +55,7 @@ func (b *Buzhash) NextBytes() ([]byte, error) { state = state ^ bytehash[buf[i]] } - for ; state&buzMask != 0; i++ { - if i >= buzMax { - break - } + for ; state&buzMask != 0 && i < buzMax; i++ { state = bits.RotateLeft32(state, 1) ^ bytehash[buf[i-32]] ^ bytehash[buf[i]] } From 9e34967aef4741b48009afa8d40e31981880ad83 Mon 
Sep 17 00:00:00 2001 From: Jakub Sztandera Date: Sun, 6 Oct 2019 13:56:15 +0200 Subject: [PATCH 3/8] Invalidate buf pointer after returning it to the pool License: MIT Signed-off-by: Jakub Sztandera --- buzhash.go | 1 + 1 file changed, 1 insertion(+) diff --git a/buzhash.go b/buzhash.go index 4bbe009..40b2e93 100644 --- a/buzhash.go +++ b/buzhash.go @@ -42,6 +42,7 @@ func (b *Buzhash) NextBytes() ([]byte, error) { } else { b.err = err pool.Put(buf) + b.buf = nil return nil, err } } From b1c398a9fac50efee5232df2c9a09ba6da7af448 Mon Sep 17 00:00:00 2001 From: Jakub Sztandera Date: Sun, 6 Oct 2019 23:34:06 +0200 Subject: [PATCH 4/8] Don't use pools for result buffers Don't return them either in benchmarks License: MIT Signed-off-by: Jakub Sztandera --- buzhash.go | 12 +++++++++--- buzhash_test.go | 6 ++---- rabin_test.go | 5 +++-- splitting_test.go | 7 +++---- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/buzhash.go b/buzhash.go index 40b2e93..edfc4c0 100644 --- a/buzhash.go +++ b/buzhash.go @@ -38,7 +38,12 @@ func (b *Buzhash) NextBytes() ([]byte, error) { if err != nil { if err == io.ErrUnexpectedEOF { b.err = io.EOF - return buf[:n+b.n], nil + res := make([]byte, n+b.n) + copy(res, buf) + + pool.Put(b.buf) + b.buf = nil + return res, nil } else { b.err = err pool.Put(buf) @@ -60,8 +65,9 @@ func (b *Buzhash) NextBytes() ([]byte, error) { state = bits.RotateLeft32(state, 1) ^ bytehash[buf[i-32]] ^ bytehash[buf[i]] } - res := buf[:i] - b.buf = pool.Get(buzMax) + res := make([]byte, i) + copy(res, b.buf) + b.n = copy(b.buf, buf[i:]) return res, nil diff --git a/buzhash_test.go b/buzhash_test.go index 15b46b7..8e2b166 100644 --- a/buzhash_test.go +++ b/buzhash_test.go @@ -7,7 +7,6 @@ import ( "testing" util "github.com/ipfs/go-ipfs-util" - pool "github.com/libp2p/go-buffer-pool" ) func TestBuzhashChunking(t *testing.T) { @@ -49,10 +48,10 @@ func TestBuzhashChunkReuse(t *testing.T) { } func BenchmarkBuzhash(b *testing.B) { - data := make([]byte, 16<<20) 
+ data := make([]byte, 1<<10) util.NewTimeSeededRand().Read(data) - b.SetBytes(16 << 20) + b.SetBytes(int64(len(data))) b.ReportAllocs() b.ResetTimer() @@ -70,7 +69,6 @@ func BenchmarkBuzhash(b *testing.B) { b.Fatal(err) } res = res + uint64(len(chunk)) - pool.Put(chunk) } } Res = Res + res diff --git a/rabin_test.go b/rabin_test.go index 7aa8a13..9eb8c66 100644 --- a/rabin_test.go +++ b/rabin_test.go @@ -96,10 +96,11 @@ func TestRabinChunkReuse(t *testing.T) { var Res uint64 func BenchmarkRabin(b *testing.B) { - data := make([]byte, 16<<20) + const size = 1 << 10 + data := make([]byte, size) util.NewTimeSeededRand().Read(data) - b.SetBytes(16 << 20) + b.SetBytes(size) b.ReportAllocs() b.ResetTimer() diff --git a/splitting_test.go b/splitting_test.go index 27afe59..f2de774 100644 --- a/splitting_test.go +++ b/splitting_test.go @@ -7,7 +7,6 @@ import ( u "github.com/ipfs/go-ipfs-util" util "github.com/ipfs/go-ipfs-util" - pool "github.com/libp2p/go-buffer-pool" ) func randBuf(t *testing.T, size int) []byte { @@ -122,10 +121,11 @@ func (s *clipReader) Read(buf []byte) (int, error) { } func BenchmarkDefault(b *testing.B) { - data := make([]byte, 16<<20) + const size = 1 << 10 + data := make([]byte, size) util.NewTimeSeededRand().Read(data) - b.SetBytes(16 << 20) + b.SetBytes(size) b.ReportAllocs() b.ResetTimer() @@ -143,7 +143,6 @@ func BenchmarkDefault(b *testing.B) { b.Fatal(err) } res = res + uint64(len(chunk)) - pool.Put(chunk) } } Res = Res + res From 25cb45d1068c0b537f335900e897721c72c10920 Mon Sep 17 00:00:00 2001 From: Jakub Sztandera Date: Mon, 7 Oct 2019 01:21:46 +0200 Subject: [PATCH 5/8] Improve benchmarks License: MIT Signed-off-by: Jakub Sztandera --- benchmark_test.go | 59 +++++++++++++++++++++++++++++++++++++++++++++++ buzhash.go | 4 ++++ buzhash_test.go | 31 ++++--------------------- rabin_test.go | 40 +++++--------------------------- splitting_test.go | 29 +++-------------------- 5 files changed, 77 insertions(+), 86 deletions(-) create mode 100644 
benchmark_test.go diff --git a/benchmark_test.go b/benchmark_test.go new file mode 100644 index 0000000..5069b06 --- /dev/null +++ b/benchmark_test.go @@ -0,0 +1,59 @@ +package chunk + +import ( + "bytes" + "io" + "math/rand" + "testing" +) + +type newSplitter func(io.Reader) Splitter + +type bencSpec struct { + size int + name string +} + +var bSizes = []bencSpec{ + {1 << 10, "1K"}, + {1 << 20, "1M"}, + {16 << 20, "16M"}, + {100 << 20, "100M"}, +} + +func benchmarkChunker(b *testing.B, ns newSplitter) { + for _, s := range bSizes { + s := s + b.Run(s.name, func(b *testing.B) { + benchmarkChunkerSize(b, ns, s.size) + }) + } +} + +func benchmarkChunkerSize(b *testing.B, ns newSplitter, size int) { + rng := rand.New(rand.NewSource(1)) + data := make([]byte, size) + rng.Read(data) + + b.SetBytes(int64(size)) + b.ReportAllocs() + b.ResetTimer() + + var res uint64 + + for i := 0; i < b.N; i++ { + r := ns(bytes.NewReader(data)) + + for { + chunk, err := r.NextBytes() + if err != nil { + if err == io.EOF { + break + } + b.Fatal(err) + } + res = res + uint64(len(chunk)) + } + } + Res = Res + res +} diff --git a/buzhash.go b/buzhash.go index edfc4c0..099b723 100644 --- a/buzhash.go +++ b/buzhash.go @@ -28,6 +28,10 @@ func NewBuzhash(r io.Reader) *Buzhash { } } +func (b *Buzhash) Reader() io.Reader { + return b.r +} + func (b *Buzhash) NextBytes() ([]byte, error) { if b.err != nil { return nil, b.err diff --git a/buzhash_test.go b/buzhash_test.go index 8e2b166..6e59d6b 100644 --- a/buzhash_test.go +++ b/buzhash_test.go @@ -41,37 +41,16 @@ func TestBuzhashChunking(t *testing.T) { } func TestBuzhashChunkReuse(t *testing.T) { - newBuzhash := func(r io.Reader) cher { + newBuzhash := func(r io.Reader) Splitter { return NewBuzhash(r) } testReuse(t, newBuzhash) } -func BenchmarkBuzhash(b *testing.B) { - data := make([]byte, 1<<10) - util.NewTimeSeededRand().Read(data) - - b.SetBytes(int64(len(data))) - b.ReportAllocs() - b.ResetTimer() - - var res uint64 - - for i := 0; i < b.N; 
i++ { - r := NewBuzhash(bytes.NewReader(data)) - - for { - chunk, err := r.NextBytes() - if err != nil { - if err == io.EOF { - break - } - b.Fatal(err) - } - res = res + uint64(len(chunk)) - } - } - Res = Res + res +func BenchmarkBuzhash2(b *testing.B) { + benchmarkChunker(b, func(r io.Reader) Splitter { + return NewBuzhash(r) + }) } func TestBuzhashBitsHash(t *testing.T) { diff --git a/rabin_test.go b/rabin_test.go index 9eb8c66..857e97c 100644 --- a/rabin_test.go +++ b/rabin_test.go @@ -39,13 +39,7 @@ func TestRabinChunking(t *testing.T) { } } -type cher interface { - NextBytes() ([]byte, error) -} - -type newChunker func(io.Reader) cher - -func chunkData(t *testing.T, newC newChunker, data []byte) map[string]blocks.Block { +func chunkData(t *testing.T, newC newSplitter, data []byte) map[string]blocks.Block { r := newC(bytes.NewReader(data)) blkmap := make(map[string]blocks.Block) @@ -66,7 +60,7 @@ func chunkData(t *testing.T, newC newChunker, data []byte) map[string]blocks.Blo return blkmap } -func testReuse(t *testing.T, cr newChunker) { +func testReuse(t *testing.T, cr newSplitter) { data := make([]byte, 1024*1024*16) util.NewTimeSeededRand().Read(data) @@ -87,7 +81,7 @@ func testReuse(t *testing.T, cr newChunker) { } func TestRabinChunkReuse(t *testing.T) { - newRabin := func(r io.Reader) cher { + newRabin := func(r io.Reader) Splitter { return NewRabin(r, 256*1024) } testReuse(t, newRabin) @@ -96,29 +90,7 @@ func TestRabinChunkReuse(t *testing.T) { var Res uint64 func BenchmarkRabin(b *testing.B) { - const size = 1 << 10 - data := make([]byte, size) - util.NewTimeSeededRand().Read(data) - - b.SetBytes(size) - b.ReportAllocs() - b.ResetTimer() - - var res uint64 - - for i := 0; i < b.N; i++ { - r := NewRabin(bytes.NewReader(data), 1024*256) - - for { - chunk, err := r.NextBytes() - if err != nil { - if err == io.EOF { - break - } - b.Fatal(err) - } - res = res + uint64(len(chunk)) - } - } - Res = Res + res + benchmarkChunker(b, func(r io.Reader) Splitter { + 
return NewRabin(r, 256<<10) + }) } diff --git a/splitting_test.go b/splitting_test.go index f2de774..d6498fc 100644 --- a/splitting_test.go +++ b/splitting_test.go @@ -6,7 +6,6 @@ import ( "testing" u "github.com/ipfs/go-ipfs-util" - util "github.com/ipfs/go-ipfs-util" ) func randBuf(t *testing.T, size int) []byte { @@ -121,29 +120,7 @@ func (s *clipReader) Read(buf []byte) (int, error) { } func BenchmarkDefault(b *testing.B) { - const size = 1 << 10 - data := make([]byte, size) - util.NewTimeSeededRand().Read(data) - - b.SetBytes(size) - b.ReportAllocs() - b.ResetTimer() - - var res uint64 - - for i := 0; i < b.N; i++ { - r := DefaultSplitter(bytes.NewReader(data)) - - for { - chunk, err := r.NextBytes() - if err != nil { - if err == io.EOF { - break - } - b.Fatal(err) - } - res = res + uint64(len(chunk)) - } - } - Res = Res + res + benchmarkChunker(b, func(r io.Reader) Splitter { + return DefaultSplitter(r) + }) } From 5ac3082b5ba63dd0dd1cdac6dc2d532d86d83b26 Mon Sep 17 00:00:00 2001 From: Jakub Sztandera Date: Mon, 7 Oct 2019 11:49:08 +0200 Subject: [PATCH 6/8] Do not exit if buffer is not full. 
License: MIT Signed-off-by: Jakub Sztandera License: MIT Signed-off-by: Jakub Sztandera --- buzhash.go | 26 ++++++++++++++++---------- buzhash_test.go | 2 +- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/buzhash.go b/buzhash.go index 099b723..1a9b747 100644 --- a/buzhash.go +++ b/buzhash.go @@ -40,14 +40,16 @@ func (b *Buzhash) NextBytes() ([]byte, error) { buf := b.buf n, err := io.ReadFull(b.r, buf[b.n:]) if err != nil { - if err == io.ErrUnexpectedEOF { - b.err = io.EOF - res := make([]byte, n+b.n) - copy(res, buf) - - pool.Put(b.buf) - b.buf = nil - return res, nil + if err == io.ErrUnexpectedEOF || err == io.EOF { + if b.n+n < buzMin { + b.err = io.EOF + res := make([]byte, b.n+n) + copy(res, buf) + + pool.Put(b.buf) + b.buf = nil + return res, nil + } } else { b.err = err pool.Put(buf) @@ -65,14 +67,18 @@ func (b *Buzhash) NextBytes() ([]byte, error) { state = state ^ bytehash[buf[i]] } - for ; state&buzMask != 0 && i < buzMax; i++ { + if b.n+n > len(buf) { + panic("this is impossible, but gives +9 to performance") + } + + for ; state&buzMask != 0 && i < b.n+n; i++ { state = bits.RotateLeft32(state, 1) ^ bytehash[buf[i-32]] ^ bytehash[buf[i]] } res := make([]byte, i) copy(res, b.buf) - b.n = copy(b.buf, buf[i:]) + b.n = copy(b.buf, buf[i:b.n+n]) return res, nil } diff --git a/buzhash_test.go b/buzhash_test.go index 6e59d6b..f630cef 100644 --- a/buzhash_test.go +++ b/buzhash_test.go @@ -53,7 +53,7 @@ func BenchmarkBuzhash2(b *testing.B) { }) } -func TestBuzhashBitsHash(t *testing.T) { +func TestBuzhashBitsHashBias(t *testing.T) { counts := make([]byte, 32) for _, h := range bytehash { for i := 0; i < 32; i++ { From 462a6eb4ba7783e59120cd9656dae712fd12218f Mon Sep 17 00:00:00 2001 From: Jakub Sztandera Date: Mon, 7 Oct 2019 11:53:44 +0200 Subject: [PATCH 7/8] Cleanup buf usage License: MIT Signed-off-by: Jakub Sztandera --- buzhash.go | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/buzhash.go b/buzhash.go 
index 1a9b747..54115d5 100644 --- a/buzhash.go +++ b/buzhash.go @@ -37,14 +37,13 @@ func (b *Buzhash) NextBytes() ([]byte, error) { return nil, b.err } - buf := b.buf - n, err := io.ReadFull(b.r, buf[b.n:]) + n, err := io.ReadFull(b.r, b.buf[b.n:]) if err != nil { if err == io.ErrUnexpectedEOF || err == io.EOF { if b.n+n < buzMin { b.err = io.EOF res := make([]byte, b.n+n) - copy(res, buf) + copy(res, b.buf) pool.Put(b.buf) b.buf = nil @@ -52,7 +51,7 @@ func (b *Buzhash) NextBytes() ([]byte, error) { } } else { b.err = err - pool.Put(buf) + pool.Put(b.buf) b.buf = nil return nil, err } @@ -64,21 +63,21 @@ func (b *Buzhash) NextBytes() ([]byte, error) { for ; i < buzMin; i++ { state = bits.RotateLeft32(state, 1) - state = state ^ bytehash[buf[i]] + state = state ^ bytehash[b.buf[i]] } - if b.n+n > len(buf) { + if b.n+n > len(b.buf) { panic("this is impossible, but gives +9 to performance") } for ; state&buzMask != 0 && i < b.n+n; i++ { - state = bits.RotateLeft32(state, 1) ^ bytehash[buf[i-32]] ^ bytehash[buf[i]] + state = bits.RotateLeft32(state, 1) ^ bytehash[b.buf[i-32]] ^ bytehash[b.buf[i]] } res := make([]byte, i) copy(res, b.buf) - b.n = copy(b.buf, buf[i:b.n+n]) + b.n = copy(b.buf, b.buf[i:b.n+n]) return res, nil } From 57fa65915807a087b69fa6c7e3442cbe56413960 Mon Sep 17 00:00:00 2001 From: Jakub Sztandera Date: Mon, 7 Oct 2019 12:01:18 +0200 Subject: [PATCH 8/8] Add buzhash to parsers list License: MIT Signed-off-by: Jakub Sztandera --- parse.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/parse.go b/parse.go index af0a31e..5d472b7 100644 --- a/parse.go +++ b/parse.go @@ -14,8 +14,8 @@ var ( ) // FromString returns a Splitter depending on the given string: -// it supports "default" (""), "size-{size}", "rabin", "rabin-{blocksize}" and -// "rabin-{min}-{avg}-{max}". +// it supports "default" (""), "size-{size}", "rabin", "rabin-{blocksize}", +// "rabin-{min}-{avg}-{max}" and "buzhash". 
func FromString(r io.Reader, chunker string) (Splitter, error) { switch { case chunker == "" || chunker == "default": @@ -34,6 +34,9 @@ func FromString(r io.Reader, chunker string) (Splitter, error) { case strings.HasPrefix(chunker, "rabin"): return parseRabinString(r, chunker) + case chunker == "buzhash": + return NewBuzhash(r), nil + default: return nil, fmt.Errorf("unrecognized chunker option: %s", chunker) }