From 756e2f0c70de1acfe00b0bab5166e527ec0a2187 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Sun, 28 May 2023 00:52:36 +0800 Subject: [PATCH] improve unsafe Decompression Performance ~4% This is another attempt to replace the aggressive compiler unrolling, after the failed attempt #69 (wrote out of bounds in some cases). The compiler's unrolling is avoided by manually unrolling less aggressively. Decompression performance is slightly improved by ca. 4%, except for the smallest test case. --- src/block/decompress.rs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/block/decompress.rs b/src/block/decompress.rs index e7048a59..d71dd938 100644 --- a/src/block/decompress.rs +++ b/src/block/decompress.rs @@ -64,13 +64,24 @@ unsafe fn duplicate_overlapping( // This is the same strategy used by the reference C implementation https://github.com/lz4/lz4/pull/772 output_ptr.write(0u8); let dst_ptr_end = output_ptr.add(match_length); - while *output_ptr < dst_ptr_end { - // Note that we copy 4 bytes, instead of one. + + while output_ptr.add(1) < dst_ptr_end { + // Note that this loop unrolling is done so that the compiler doesn't do it in an awful + // way. // Without that the compiler will unroll/auto-vectorize the copy with a lot of branches. // This is not what we want, as large overlapping copies are not that common. core::ptr::copy(start, *output_ptr, 1); start = start.add(1); *output_ptr = output_ptr.add(1); + + core::ptr::copy(start, *output_ptr, 1); + start = start.add(1); + *output_ptr = output_ptr.add(1); + } + + if *output_ptr < dst_ptr_end { + core::ptr::copy(start, *output_ptr, 1); + *output_ptr = output_ptr.add(1); } }