From c69ba3787e9ff5b2f45165cc55fb8e348d48262f Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Fri, 2 Dec 2022 16:16:11 +0100
Subject: [PATCH 1/2] zstd: Select best match using selection trees
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

name                              old speed      new speed      delta
Encoder_EncodeAllSimple/best-8    12.2MB/s ± 1%  13.5MB/s ± 3%  +10.55%  (p=0.000 n=20+19)
Encoder_EncodeAllSimple4K/best-8  10.5MB/s ± 1%  11.9MB/s ± 1%  +13.52%  (p=0.000 n=20+19)

name                              old alloc/op   new alloc/op   delta
Encoder_EncodeAllSimple/best-8     18.0B ± 0%     16.0B ± 0%  -11.11%  (p=0.000 n=18+17)
Encoder_EncodeAllSimple4K/best-8   1.00B ± 0%     1.00B ± 0%     ~     (all equal)
---
 zstd/enc_best.go | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/zstd/enc_best.go b/zstd/enc_best.go
index 817df64e54..a54002ca2e 100644
--- a/zstd/enc_best.go
+++ b/zstd/enc_best.go
@@ -216,22 +216,26 @@ encodeLoop:
 			return m
 		}
 
-		best := bestOf(matchAt(candidateL.offset-e.cur, s, uint32(cv), -1), matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
-		best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
-		best = bestOf(best, matchAt(candidateS.prev-e.cur, s, uint32(cv), -1))
+		m1 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)
+		m2 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
+		m3 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)
+		m4 := matchAt(candidateS.prev-e.cur, s, uint32(cv), -1)
+		best := bestOf(bestOf(m1, m2), bestOf(m3, m4))
 
 		if canRepeat && best.length < goodEnough {
 			cv32 := uint32(cv >> 8)
 			spp := s + 1
-			best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
-			best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
-			best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
+			m1 := matchAt(spp-offset1, spp, cv32, 1)
+			m2 := matchAt(spp-offset2, spp, cv32, 2)
+			m3 := matchAt(spp-offset3, spp, cv32, 3)
+			best = bestOf(bestOf(best, m1), bestOf(m2, m3))
 			if best.length > 0 {
 				cv32 = uint32(cv >> 24)
 				spp += 2
-				best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
-				best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
-				best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
+				m1 := matchAt(spp-offset1, spp, cv32, 1)
+				m2 := matchAt(spp-offset2, spp, cv32, 2)
+				m3 := matchAt(spp-offset3, spp, cv32, 3)
+				best = bestOf(bestOf(best, m1), bestOf(m2, m3))
 			}
 		}
 		// Load next and check...
@@ -258,12 +262,13 @@ encodeLoop:
 			candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]
 
 			// Short at s+1
-			best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
+			m1 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)
 			// Long at s+1, s+2
-			best = bestOf(best, matchAt(candidateL.offset-e.cur, s, uint32(cv), -1))
-			best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
-			best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1))
-			best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1))
+			m2 := matchAt(candidateL.offset-e.cur, s, uint32(cv), -1)
+			m3 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
+			m4 := matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
+			m5 := matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
+			best = bestOf(bestOf(bestOf(best, m1), m2), bestOf(bestOf(m3, m4), m5))
 			if false {
 				// Short at s+3.
 				// Too often worse...

From d1da464850c4b2e90ed1b3b9f1aba0b8f662c2ce Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Sat, 3 Dec 2022 07:58:31 +0100
Subject: [PATCH 2/2] zstd: Track best match in best encoder by pointer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As long as matchAt does not return a pointer, escape analysis determines
that the matches can stay on the stack. This works in Go 1.17, too.

name                              old speed      new speed      delta
Encoder_EncodeAllSimple/best-8    13.5MB/s ± 3%  15.3MB/s ± 2%  +13.35%  (p=0.000 n=19+19)
Encoder_EncodeAllSimple4K/best-8  11.9MB/s ± 1%  12.9MB/s ± 0%   +8.38%  (p=0.000 n=19+17)

name                              old alloc/op   new alloc/op   delta
Encoder_EncodeAllSimple/best-8     16.0B ± 0%     14.0B ± 0%  -12.50%  (p=0.000 n=17+20)
Encoder_EncodeAllSimple4K/best-8   1.00B ± 0%     1.00B ± 0%     ~     (all equal)
---
 zstd/enc_best.go | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/zstd/enc_best.go b/zstd/enc_best.go
index a54002ca2e..830f5ba74a 100644
--- a/zstd/enc_best.go
+++ b/zstd/enc_best.go
@@ -189,7 +189,7 @@ encodeLoop:
 			panic("offset0 was 0")
 		}
 
-		bestOf := func(a, b match) match {
+		bestOf := func(a, b *match) *match {
 			if a.est-b.est+(a.s-b.s)*bitsPerByte>>10 < 0 {
 				return a
 			}
@@ -220,7 +220,7 @@ encodeLoop:
 		m2 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
 		m3 := matchAt(candidateS.offset-e.cur, s, uint32(cv), -1)
 		m4 := matchAt(candidateS.prev-e.cur, s, uint32(cv), -1)
-		best := bestOf(bestOf(m1, m2), bestOf(m3, m4))
+		best := bestOf(bestOf(&m1, &m2), bestOf(&m3, &m4))
 
 		if canRepeat && best.length < goodEnough {
 			cv32 := uint32(cv >> 8)
@@ -228,14 +228,14 @@ encodeLoop:
 			m1 := matchAt(spp-offset1, spp, cv32, 1)
 			m2 := matchAt(spp-offset2, spp, cv32, 2)
 			m3 := matchAt(spp-offset3, spp, cv32, 3)
-			best = bestOf(bestOf(best, m1), bestOf(m2, m3))
+			best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3))
 			if best.length > 0 {
 				cv32 = uint32(cv >> 24)
 				spp += 2
 				m1 := matchAt(spp-offset1, spp, cv32, 1)
 				m2 := matchAt(spp-offset2, spp, cv32, 2)
 				m3 := matchAt(spp-offset3, spp, cv32, 3)
-				best = bestOf(bestOf(best, m1), bestOf(m2, m3))
+				best = bestOf(bestOf(best, &m1), bestOf(&m2, &m3))
 			}
 		}
 		// Load next and check...
@@ -268,11 +268,12 @@ encodeLoop:
 			m3 := matchAt(candidateL.prev-e.cur, s, uint32(cv), -1)
 			m4 := matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
 			m5 := matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
-			best = bestOf(bestOf(bestOf(best, m1), m2), bestOf(bestOf(m3, m4), m5))
+			best = bestOf(bestOf(bestOf(best, &m1), &m2), bestOf(bestOf(&m3, &m4), &m5))
 			if false {
 				// Short at s+3.
 				// Too often worse...
-				best = bestOf(best, matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1))
+				m := matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1)
+				best = bestOf(best, &m)
 			}
 			// See if we can find a better match by checking where the current best ends.
 			// Use that offset to see if we can find a better full match.
@@ -283,9 +284,11 @@ encodeLoop:
 				// For this compression level 2 yields the best results.
 				const skipBeginning = 2
 				if pos := candidateEnd.offset - e.cur - best.length + skipBeginning; pos >= 0 {
-					bestEnd := bestOf(best, matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1))
+					m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
+					bestEnd := bestOf(best, &m)
 					if pos := candidateEnd.prev - e.cur - best.length + skipBeginning; pos >= 0 {
-						bestEnd = bestOf(bestEnd, matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1))
+						m := matchAt(pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
+						bestEnd = bestOf(bestEnd, &m)
 					}
 					best = bestEnd
 				}