diff --git a/src/encode/mod.rs b/src/encode/mod.rs index f5f4e4a..18e23dd 100644 --- a/src/encode/mod.rs +++ b/src/encode/mod.rs @@ -69,30 +69,54 @@ pub fn encode_to_slice(num: T, slice: &mut [u8]) -> u8 { #[cfg(any(target_feature = "sse2", doc))] #[cfg_attr(rustc_nightly, doc(cfg(target_feature = "sse2")))] pub unsafe fn encode_unsafe(num: T) -> ([u8; 16], u8) { - // Break the number into 7-bit parts and spread them out into a vector - let stage1: __m128i = std::mem::transmute(num.num_to_vector_stage1()); + if T::MAX_VARINT_BYTES <= 5 { + // dbg!(num); + let stage1 = num.num_to_scalar_stage1(); - // Create a mask for where there exist values - // This signed comparison works because all MSBs should be cleared at this point - // Also handle the special case when num == 0 - let minimum = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xffu8 as i8); - let exists = _mm_or_si128(_mm_cmpgt_epi8(stage1, _mm_setzero_si128()), minimum); - let bits = _mm_movemask_epi8(exists); + const USE_CTLZ_NONZERO: bool = false; // (cfg!(target_arch = "x86_64") || cfg!(target_arch = "x86")) && !cfg!(target_feature = "lzcnt") && cfg!(rustc_nightly); - // Count the number of bytes used - let bytes = 32 - bits.leading_zeros() as u8; // lzcnt on supported CPUs - // TODO: Compiler emits an unnecessary branch here when using bsr/bsl fallback + // guard against undefined behavior on legacy Intel and eliminate a branch ? + let leading = (stage1).leading_zeros(); + // dbg!(leading); + // dbg!(stage1); - // Fill that many bytes into a vector - let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let mask = _mm_cmplt_epi8(ascend, _mm_set1_epi8(bytes as i8)); + let unused_bytes = (leading - 1) / 8; + let bytes_needed = 8 - unused_bytes; + // dbg!(bytes_needed); - // Shift it down 1 byte so the last MSB is the only one set, and make sure only the MSB is set - let shift = _mm_bsrli_si128(mask, 1); - let msbmask = _mm_and_si128(shift, _mm_set1_epi8(128u8 as i8)); + // set all but the last MSBs + let msbs = 0x8080808080808080; + let msbmask = 0xFFFFFFFFFFFFFFFF >> ((8 - bytes_needed + 1) * 8 - 1); - // Merge the MSB bits into the vector - let merged = _mm_or_si128(stage1, msbmask); + let merged = stage1 | (msbs & msbmask); - (std::mem::transmute(merged), bytes) + (std::mem::transmute([merged, 0]), bytes_needed as u8) + } else { + // Break the number into 7-bit parts and spread them out into a vector + let stage1: __m128i = std::mem::transmute(num.num_to_vector_stage1()); + + // Create a mask for where there exist values + // This signed comparison works because all MSBs should be cleared at this point + // Also handle the special case when num == 0 + let minimum = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xffu8 as i8); + let exists = _mm_or_si128(_mm_cmpgt_epi8(stage1, _mm_setzero_si128()), minimum); + let bits = _mm_movemask_epi8(exists); + + // Count the number of bytes used + let bytes = 32 - bits.leading_zeros() as u8; // lzcnt on supported CPUs + // TODO: Compiler emits an unnecessary branch here when using bsr/bsl fallback + + // Fill that many bytes into a vector + let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let mask = _mm_cmplt_epi8(ascend, _mm_set1_epi8(bytes as i8)); + + // Shift it down 1 byte so the last MSB is the only one set, and make sure only the MSB is set + let shift = _mm_bsrli_si128(mask, 1); + let msbmask = _mm_and_si128(shift, _mm_set1_epi8(128u8 as i8)); + + // Merge the MSB bits into the vector + let merged = _mm_or_si128(stage1, msbmask); + + (std::mem::transmute(merged), bytes) + } }