From 96bfbb4c1260e0339b9ee5c8b615faa59f0292c8 Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Sat, 27 May 2023 17:47:05 +0800
Subject: [PATCH] improve safe Decompression 3-28%

improve safe Decompression by 3-28%
improve safe Compression by 0-16%

Replace calls to memcpy with custom function
---
 src/block/compress.rs |   7 +-
 src/fastcpy.rs        | 145 ++++++++++++++++++++++++++++++++++++++++++
 src/lib.rs            |   4 +-
 src/sink.rs           |   4 +-
 4 files changed, 152 insertions(+), 8 deletions(-)
 create mode 100644 src/fastcpy.rs

diff --git a/src/block/compress.rs b/src/block/compress.rs
index 466783c1..8f886726 100644
--- a/src/block/compress.rs
+++ b/src/block/compress.rs
@@ -562,12 +562,7 @@ fn push_u32(output: &mut impl Sink, el: u32) {
 #[inline(always)] // (always) necessary otherwise compiler fails to inline it
 #[cfg(feature = "safe-encode")]
 fn copy_literals_wild(output: &mut impl Sink, input: &[u8], input_start: usize, len: usize) {
-    match len {
-        0..=8 => output.extend_from_slice_wild(&input[input_start..input_start + 8], len),
-        9..=16 => output.extend_from_slice_wild(&input[input_start..input_start + 16], len),
-        17..=24 => output.extend_from_slice_wild(&input[input_start..input_start + 24], len),
-        _ => output.extend_from_slice_wild(&input[input_start..input_start + len], len),
-    }
+    output.extend_from_slice_wild(&input[input_start..input_start + len], len)
 }
 
 #[inline]
diff --git a/src/fastcpy.rs b/src/fastcpy.rs
new file mode 100644
index 00000000..64267e36
--- /dev/null
+++ b/src/fastcpy.rs
@@ -0,0 +1,145 @@
+//! # FastCpy
+//!
+//! The Rust Compiler calls `memcpy` for slices of unknown length.
+//! This crate provides a faster implementation of `memcpy` for slices up to 32 bytes (64 bytes with `avx`).
+//! If you know most of your copy operations are not too big you can use `fastcpy` to speed up your program.
+//!
+//! `fastcpy` is designed to contain not too much assembly, so the overhead is low.
+//!
+//! As a fallback the standard `memcpy` is called
+//!
+//! ## Double Copy Trick
+//! `fastcpy` employs a double copy trick to copy slices of length 4-32 bytes (64 bytes with `avx`).
+//! E.g. a slice of length 6 can be copied with two unconditional copy operations.
+//!
+//! [1, 2, 3, 4, 5, 6]
+//! [1, 2, 3, 4]
+//!       [3, 4, 5, 6]
+
+#[inline]
+pub fn slice_copy(src: &[u8], dst: &mut [u8]) {
+    #[inline(never)]
+    #[cold]
+    #[track_caller]
+    fn len_mismatch_fail(dst_len: usize, src_len: usize) -> ! {
+        panic!(
+            "source slice length ({}) does not match destination slice length ({})",
+            src_len, dst_len,
+        );
+    }
+
+    if src.len() != dst.len() {
+        len_mismatch_fail(dst.len(), src.len());
+    }
+    let len = src.len();
+
+    if src.is_empty() {
+        return;
+    }
+
+    if len < 4 {
+        short_copy(src, dst);
+        return;
+    }
+
+    if len < 8 {
+        double_copy_trick::<4>(src, dst);
+        return;
+    }
+
+    if len <= 16 {
+        double_copy_trick::<8>(src, dst);
+        return;
+    }
+
+    if len <= 32 {
+        double_copy_trick::<16>(src, dst);
+        return;
+    }
+
+    // The code will use the vmovdqu instruction to copy 32 bytes at a time.
+    #[cfg(target_feature = "avx")]
+    {
+        if len <= 64 {
+            double_copy_trick::<32>(src, dst);
+            return;
+        }
+    }
+
+    // For larger sizes we use the default, which calls memcpy
+    // memcpy does some virtual memory tricks to copy large chunks of memory.
+    //
+    // The theory should be that the checks above don't cost much relative to the copy call for
+    // larger copies.
+    // The bounds checks in `copy_from_slice` are elided.
+    dst.copy_from_slice(src);
+}
+
+#[inline(always)]
+fn short_copy(src: &[u8], dst: &mut [u8]) {
+    let len = src.len();
+
+    // length 1-3
+    dst[0] = src[0];
+    if len >= 2 {
+        double_copy_trick::<2>(src, dst);
+    }
+}
+
+#[inline(always)]
+/// [1, 2, 3, 4, 5, 6]
+/// [1, 2, 3, 4]
+///       [3, 4, 5, 6]
+fn double_copy_trick<const SIZE: usize>(src: &[u8], dst: &mut [u8]) {
+    dst[0..SIZE].copy_from_slice(&src[0..SIZE]);
+    dst[src.len() - SIZE..].copy_from_slice(&src[src.len() - SIZE..]);
+}
+
+#[cfg(test)]
+mod tests {
+    use super::slice_copy;
+    use proptest::prelude::*;
+
+    proptest! {
+        #[test]
+        fn test_fast_short_slice_copy(left: Vec<u8>) {
+            let mut right = vec![0u8; left.len()];
+            slice_copy(&left, &mut right);
+            prop_assert_eq!(&left, &right);
+        }
+    }
+
+    #[test]
+    fn test_fast_short_slice_copy_edge_cases() {
+        for len in 0..(512 * 2) {
+            let left = (0..len).map(|i| i as u8).collect::<Vec<u8>>();
+            let mut right = vec![0u8; len];
+            slice_copy(&left, &mut right);
+            assert_eq!(left, right);
+        }
+    }
+
+    #[test]
+    fn test_fail2() {
+        let left = vec![
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31, 32,
+        ];
+        let mut right = vec![0u8; left.len()];
+        slice_copy(&left, &mut right);
+        assert_eq!(left, right);
+    }
+
+    #[test]
+    fn test_fail() {
+        let left = vec![
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        ];
+        let mut right = vec![0u8; left.len()];
+        slice_copy(&left, &mut right);
+        assert_eq!(left, right);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index e13a7ad3..516ae824 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -89,8 +89,10 @@ pub mod block;
 #[cfg_attr(docsrs, doc(cfg(feature = "frame")))]
 pub mod frame;
 
-pub use block::{compress, compress_into, compress_prepend_size};
+#[allow(dead_code)]
+mod fastcpy;
+pub use block::{compress, compress_into, compress_prepend_size};
 pub use block::{decompress, decompress_into, decompress_size_prepended};
 
 #[cfg_attr(
diff --git a/src/sink.rs b/src/sink.rs
index 556008a6..de97770f 100644
--- a/src/sink.rs
+++ b/src/sink.rs
@@ -1,6 +1,8 @@
 #[allow(unused_imports)]
 use alloc::vec::Vec;
 
+use crate::fastcpy::slice_copy;
+
 /// Returns a Sink implementation appropriate for outputing up to `required_capacity`
 /// bytes at `vec[offset..offset+required_capacity]`.
 /// It can be either a `SliceSink` (pre-filling the vec with zeroes if necessary)
@@ -168,7 +170,7 @@ impl<'a> Sink for SliceSink<'a> {
     #[inline]
     fn extend_from_slice_wild(&mut self, data: &[u8], copy_len: usize) {
         assert!(copy_len <= data.len());
-        self.output[self.pos..self.pos + data.len()].copy_from_slice(data);
+        slice_copy(data, &mut self.output[self.pos..self.pos + data.len()]);
         self.pos += copy_len;
     }