improve safe Decompression by 3-28%
improve safe Decompression by 3-28%
improve safe Compression by 0-16%

Replace calls to memcpy with a custom function
PSeitz committed May 27, 2023
1 parent 320279f commit 4d36f98
Showing 4 changed files with 152 additions and 8 deletions.
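Before the diffs, a sketch of the core idea (a standalone illustration, not code from this commit; the function names are made up): Rust lowers a `copy_from_slice` whose length is a compile-time constant to a few unconditional loads and stores, while a copy of unknown length becomes a call to `memcpy`, whose call overhead dominates for short slices.

// Standalone illustration. Length known at compile time: typically a single
// 8-byte load/store pair, no function call.
fn copy_fixed(dst: &mut [u8], src: &[u8]) {
    dst[..8].copy_from_slice(&src[..8]);
}

// Length only known at run time: the compiler emits a call to `memcpy`.
fn copy_variable(dst: &mut [u8], src: &[u8]) {
    dst[..src.len()].copy_from_slice(src);
}

The new src/fastcpy.rs module below builds on this: it dispatches short copies to fixed-size, overlapping block copies and only falls back to `memcpy` for longer slices.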
7 changes: 1 addition & 6 deletions src/block/compress.rs
@@ -562,12 +562,7 @@ fn push_u32(output: &mut impl Sink, el: u32) {
 #[inline(always)] // (always) necessary otherwise compiler fails to inline it
 #[cfg(feature = "safe-encode")]
 fn copy_literals_wild(output: &mut impl Sink, input: &[u8], input_start: usize, len: usize) {
-    match len {
-        0..=8 => output.extend_from_slice_wild(&input[input_start..input_start + 8], len),
-        9..=16 => output.extend_from_slice_wild(&input[input_start..input_start + 16], len),
-        17..=24 => output.extend_from_slice_wild(&input[input_start..input_start + 24], len),
-        _ => output.extend_from_slice_wild(&input[input_start..input_start + len], len),
-    }
+    output.extend_from_slice_wild(&input[input_start..input_start + len], len)
 }

 #[inline]
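(The length-bucketed match above is no longer needed: `extend_from_slice_wild` in src/sink.rs, below, now routes every copy through the new `slice_copy`, which performs an equivalent size dispatch internally.)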
145 changes: 145 additions & 0 deletions src/fastcpy.rs
@@ -0,0 +1,145 @@
//! # FastCpy
//!
//! The Rust compiler calls `memcpy` for slices of unknown length.
//! This module provides a faster implementation of `memcpy` for slices of up to 32 bytes (64 bytes with `avx`).
//! If you know that most of your copy operations are not too big, you can use `fastcpy` to speed up your program.
//!
//! `fastcpy` is designed to emit only a small amount of assembly, so the overhead is low.
//!
//! As a fallback, the standard `memcpy` is called.
//!
//! ## Double Copy Trick
//! `fastcpy` employs a double copy trick to copy slices of length 4-32 bytes (64 bytes with `avx`).
//! E.g. a slice of length 6 can be copied with two unconditional copy operations:
//!
//! [1, 2, 3, 4, 5, 6]
//! [1, 2, 3, 4]
//!       [3, 4, 5, 6]
//!
#[inline]
pub fn slice_copy(src: &[u8], dst: &mut [u8]) {
    #[inline(never)]
    #[cold]
    #[track_caller]
    fn len_mismatch_fail(dst_len: usize, src_len: usize) -> ! {
        panic!(
            "source slice length ({}) does not match destination slice length ({})",
            src_len, dst_len,
        );
    }

    if src.len() != dst.len() {
        // Arguments are (dst_len, src_len) so they line up with the panic message.
        len_mismatch_fail(dst.len(), src.len());
    }
    let len = src.len();

    if src.is_empty() {
        return;
    }

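    // Dispatch on length: each branch below issues two fixed-size, possibly
    // overlapping block copies; double_copy_trick::<SIZE> handles any len in
    // SIZE..=2 * SIZE without further branching.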
    if len < 4 {
        short_copy(src, dst);
        return;
    }

    if len < 8 {
        double_copy_trick::<4>(src, dst);
        return;
    }

    if len <= 16 {
        double_copy_trick::<8>(src, dst);
        return;
    }

    if len <= 32 {
        double_copy_trick::<16>(src, dst);
        return;
    }

    // The code below uses the vmovdqu instruction to copy 32 bytes at a time.
    #[cfg(target_feature = "avx")]
    {
        if len <= 64 {
            double_copy_trick::<32>(src, dst);
            return;
        }
    }

    // For larger sizes we use the default, which calls memcpy.
    // memcpy does some virtual memory tricks to copy large chunks of memory.
    //
    // The theory is that the checks above cost little relative to the call
    // overhead for larger copies.
    // The bounds checks in `copy_from_slice` are elided because src.len() ==
    // dst.len() was verified above.
    dst.copy_from_slice(src);
}

#[inline(always)]
fn short_copy(src: &[u8], dst: &mut [u8]) {
    let len = src.len();

    // length 1-3
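    // The single byte copy below fully handles len == 1; for len 2 or 3 the two
    // overlapping 2-byte copies (first two and last two bytes) cover the rest.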
    dst[0] = src[0];
    if len >= 2 {
        double_copy_trick::<2>(src, dst);
    }
}

#[inline(always)]
/// [1, 2, 3, 4, 5, 6]
/// [1, 2, 3, 4]
///       [3, 4, 5, 6]
fn double_copy_trick<const SIZE: usize>(src: &[u8], dst: &mut [u8]) {
    dst[0..SIZE].copy_from_slice(&src[0..SIZE]);
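    // The second copy is anchored to the end of the slice and overlaps the
    // first one whenever src.len() < 2 * SIZE.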
    dst[src.len() - SIZE..].copy_from_slice(&src[src.len() - SIZE..]);
}

#[cfg(test)]
mod tests {
    use super::slice_copy;
    use proptest::prelude::*;

    proptest! {
        #[test]
        fn test_fast_short_slice_copy(left: Vec<u8>) {
            let mut right = vec![0u8; left.len()];
            slice_copy(&left, &mut right);
            prop_assert_eq!(&left, &right);
        }
    }

    #[test]
    fn test_fast_short_slice_copy_edge_cases() {
        for len in 0..(512 * 2) {
            let left = (0..len).map(|i| i as u8).collect::<Vec<_>>();
            let mut right = vec![0u8; len];
            slice_copy(&left, &mut right);
            assert_eq!(left, right);
        }
    }

    #[test]
    fn test_fail2() {
        let left = vec![
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31, 32,
        ];
        let mut right = vec![0u8; left.len()];
        slice_copy(&left, &mut right);
        assert_eq!(left, right);
    }

    #[test]
    fn test_fail() {
        let left = vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let mut right = vec![0u8; left.len()];
        slice_copy(&left, &mut right);
        assert_eq!(left, right);
    }
}
4 changes: 3 additions & 1 deletion src/lib.rs
@@ -89,8 +89,10 @@ pub mod block;
 #[cfg_attr(docsrs, doc(cfg(feature = "frame")))]
 pub mod frame;
 
-pub use block::{compress, compress_into, compress_prepend_size};
+#[allow(dead_code)]
+mod fastcpy;
 
+pub use block::{compress, compress_into, compress_prepend_size};
 pub use block::{decompress, decompress_into, decompress_size_prepended};

 #[cfg_attr(
4 changes: 3 additions & 1 deletion src/sink.rs
@@ -1,6 +1,8 @@
 #[allow(unused_imports)]
 use alloc::vec::Vec;
 
+use crate::fastcpy::slice_copy;
+
 /// Returns a Sink implementation appropriate for outputting up to `required_capacity`
 /// bytes at `vec[offset..offset+required_capacity]`.
 /// It can be either a `SliceSink` (pre-filling the vec with zeroes if necessary)
@@ -168,7 +170,7 @@ impl<'a> Sink for SliceSink<'a> {
     #[inline]
     fn extend_from_slice_wild(&mut self, data: &[u8], copy_len: usize) {
         assert!(copy_len <= data.len());
-        self.output[self.pos..self.pos + data.len()].copy_from_slice(data);
+        slice_copy(data, &mut self.output[self.pos..self.pos + data.len()]);
         self.pos += copy_len;
     }
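For context on the headline numbers, here is a minimal Criterion benchmark sketch against the public API (a hypothetical harness, not part of this commit; the input data and bench name are made up):

use criterion::{black_box, criterion_group, criterion_main, Criterion};

// Compresses 64 KiB once, then measures decompression of it; the safe
// decompression path is where the commit claims a 3-28% speedup.
fn bench_decompress(c: &mut Criterion) {
    let input: Vec<u8> = (0..64 * 1024).map(|i| (i % 251) as u8).collect();
    let compressed = lz4_flex::compress_prepend_size(&input);
    c.bench_function("decompress_64k", |b| {
        b.iter(|| lz4_flex::decompress_size_prepended(black_box(&compressed)).unwrap())
    });
}

criterion_group!(benches, bench_decompress);
criterion_main!(benches);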
