improve safe Decompression by 3-28%
improve safe Decompression by 3-28%
improve safe Compression by 0-16%

Replace calls to memcpy with a custom function
PSeitz committed May 27, 2023
1 parent 320279f commit 4d36f98
Showing 4 changed files with 152 additions and 8 deletions.
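Before the diffs, a sketch of the core idea (a standalone illustration, not code from this commit; the function names are made up): Rust lowers a `copy_from_slice` whose length is a compile-time constant to a few unconditional loads and stores, while a copy of unknown length becomes a call to `memcpy`, whose call overhead dominates for short slices.

// Standalone illustration. Length known at compile time: typically a single
// 8-byte load/store pair, no function call.
fn copy_fixed(dst: &mut [u8], src: &[u8]) {
    dst[..8].copy_from_slice(&src[..8]);
}

// Length only known at run time: the compiler emits a call to `memcpy`.
fn copy_variable(dst: &mut [u8], src: &[u8]) {
    dst[..src.len()].copy_from_slice(src);
}

The new src/fastcpy.rs module below builds on this: it dispatches short copies to fixed-size, overlapping block copies and only falls back to `memcpy` for longer slices.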
7 changes: 1 addition & 6 deletions src/block/compress.rs
@@ -562,12 +562,7 @@ fn push_u32(output: &mut impl Sink, el: u32) {
 #[inline(always)] // (always) necessary otherwise compiler fails to inline it
 #[cfg(feature = "safe-encode")]
 fn copy_literals_wild(output: &mut impl Sink, input: &[u8], input_start: usize, len: usize) {
-    match len {
-        0..=8 => output.extend_from_slice_wild(&input[input_start..input_start + 8], len),
-        9..=16 => output.extend_from_slice_wild(&input[input_start..input_start + 16], len),
-        17..=24 => output.extend_from_slice_wild(&input[input_start..input_start + 24], len),
-        _ => output.extend_from_slice_wild(&input[input_start..input_start + len], len),
-    }
+    output.extend_from_slice_wild(&input[input_start..input_start + len], len)
 }

 #[inline]
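(The length-bucketed match above is no longer needed: `extend_from_slice_wild` in src/sink.rs, below, now routes every copy through the new `slice_copy`, which performs an equivalent size dispatch internally.)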
145 changes: 145 additions & 0 deletions src/fastcpy.rs
@@ -0,0 +1,145 @@
//! # FastCpy
//!
//! The Rust compiler calls `memcpy` for slices of unknown length.
//! This module provides a faster implementation of `memcpy` for slices of up to 32 bytes (64 bytes with `avx`).
//! If you know that most of your copy operations are not too big, you can use `fastcpy` to speed up your program.
//!
//! `fastcpy` is designed to emit only a small amount of assembly, so the overhead is low.
//!
//! As a fallback, the standard `memcpy` is called.
//!
//! ## Double Copy Trick
//! `fastcpy` employs a double copy trick to copy slices of length 4-32 bytes (64 bytes with `avx`).
//! E.g. a slice of length 6 can be copied with two unconditional copy operations:
//!
//! [1, 2, 3, 4, 5, 6]
//! [1, 2, 3, 4]
//!       [3, 4, 5, 6]
//!
#[inline]
pub fn slice_copy(src: &[u8], dst: &mut [u8]) {
    #[inline(never)]
    #[cold]
    #[track_caller]
    fn len_mismatch_fail(dst_len: usize, src_len: usize) -> ! {
        panic!(
            "source slice length ({}) does not match destination slice length ({})",
            src_len, dst_len,
        );
    }

    if src.len() != dst.len() {
        // Arguments are (dst_len, src_len) so they line up with the panic message.
        len_mismatch_fail(dst.len(), src.len());
    }
    let len = src.len();

    if src.is_empty() {
        return;
    }

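    // Dispatch on length: each branch below issues two fixed-size, possibly
    // overlapping block copies; double_copy_trick::<SIZE> handles any len in
    // SIZE..=2 * SIZE without further branching.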
    if len < 4 {
        short_copy(src, dst);
        return;
    }

    if len < 8 {
        double_copy_trick::<4>(src, dst);
        return;
    }

    if len <= 16 {
        double_copy_trick::<8>(src, dst);
        return;
    }

    if len <= 32 {
        double_copy_trick::<16>(src, dst);
        return;
    }

    // The code below uses the vmovdqu instruction to copy 32 bytes at a time.
    #[cfg(target_feature = "avx")]
    {
        if len <= 64 {
            double_copy_trick::<32>(src, dst);
            return;
        }
    }

    // For larger sizes we use the default, which calls memcpy.
    // memcpy does some virtual memory tricks to copy large chunks of memory.
    //
    // The theory is that the checks above cost little relative to the call
    // overhead for larger copies.
    // The bounds checks in `copy_from_slice` are elided because src.len() ==
    // dst.len() was verified above.
    dst.copy_from_slice(src);
}

#[inline(always)]
fn short_copy(src: &[u8], dst: &mut [u8]) {
    let len = src.len();

    // length 1-3
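    // The single byte copy below fully handles len == 1; for len 2 or 3 the two
    // overlapping 2-byte copies (first two and last two bytes) cover the rest.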
    dst[0] = src[0];
    if len >= 2 {
        double_copy_trick::<2>(src, dst);
    }
}

#[inline(always)]
/// [1, 2, 3, 4, 5, 6]
/// [1, 2, 3, 4]
///       [3, 4, 5, 6]
fn double_copy_trick<const SIZE: usize>(src: &[u8], dst: &mut [u8]) {
    dst[0..SIZE].copy_from_slice(&src[0..SIZE]);
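    // The second copy is anchored to the end of the slice and overlaps the
    // first one whenever src.len() < 2 * SIZE.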
    dst[src.len() - SIZE..].copy_from_slice(&src[src.len() - SIZE..]);
}

#[cfg(test)]
mod tests {
    use super::slice_copy;
    use proptest::prelude::*;

    proptest! {
        #[test]
        fn test_fast_short_slice_copy(left: Vec<u8>) {
            let mut right = vec![0u8; left.len()];
            slice_copy(&left, &mut right);
            prop_assert_eq!(&left, &right);
        }
    }

    #[test]
    fn test_fast_short_slice_copy_edge_cases() {
        for len in 0..(512 * 2) {
            let left = (0..len).map(|i| i as u8).collect::<Vec<_>>();
            let mut right = vec![0u8; len];
            slice_copy(&left, &mut right);
            assert_eq!(left, right);
        }
    }

    #[test]
    fn test_fail2() {
        let left = vec![
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31, 32,
        ];
        let mut right = vec![0u8; left.len()];
        slice_copy(&left, &mut right);
        assert_eq!(left, right);
    }

    #[test]
    fn test_fail() {
        let left = vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let mut right = vec![0u8; left.len()];
        slice_copy(&left, &mut right);
        assert_eq!(left, right);
    }
}
4 changes: 3 additions & 1 deletion src/lib.rs
@@ -89,8 +89,10 @@ pub mod block;
 #[cfg_attr(docsrs, doc(cfg(feature = "frame")))]
 pub mod frame;
 
-pub use block::{compress, compress_into, compress_prepend_size};
+#[allow(dead_code)]
+mod fastcpy;
 
+pub use block::{compress, compress_into, compress_prepend_size};
 pub use block::{decompress, decompress_into, decompress_size_prepended};

 #[cfg_attr(
4 changes: 3 additions & 1 deletion src/sink.rs
@@ -1,6 +1,8 @@
 #[allow(unused_imports)]
 use alloc::vec::Vec;
 
+use crate::fastcpy::slice_copy;
+
 /// Returns a Sink implementation appropriate for outputting up to `required_capacity`
 /// bytes at `vec[offset..offset+required_capacity]`.
 /// It can be either a `SliceSink` (pre-filling the vec with zeroes if necessary)
@@ -168,7 +170,7 @@ impl<'a> Sink for SliceSink<'a> {
     #[inline]
     fn extend_from_slice_wild(&mut self, data: &[u8], copy_len: usize) {
         assert!(copy_len <= data.len());
-        self.output[self.pos..self.pos + data.len()].copy_from_slice(data);
+        slice_copy(data, &mut self.output[self.pos..self.pos + data.len()]);
         self.pos += copy_len;
     }
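For context on the headline numbers, here is a minimal Criterion benchmark sketch against the public API (a hypothetical harness, not part of this commit; the input data and bench name are made up):

use criterion::{black_box, criterion_group, criterion_main, Criterion};

// Compresses 64 KiB once, then measures decompression of it; the safe
// decompression path is where the commit claims a 3-28% speedup.
fn bench_decompress(c: &mut Criterion) {
    let input: Vec<u8> = (0..64 * 1024).map(|i| (i % 251) as u8).collect();
    let compressed = lz4_flex::compress_prepend_size(&input);
    c.bench_function("decompress_64k", |b| {
        b.iter(|| lz4_flex::decompress_size_prepended(black_box(&compressed)).unwrap())
    });
}

criterion_group!(benches, bench_decompress);
criterion_main!(benches);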
