From 262452308a3192fecc3f4d88a2690dc67e681ef5 Mon Sep 17 00:00:00 2001 From: The 8472 Date: Wed, 4 Oct 2023 16:21:02 +0200 Subject: [PATCH 1/3] add benchmark for generic slice PartialEq impl --- library/core/benches/slice.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/library/core/benches/slice.rs b/library/core/benches/slice.rs index 3bfb35e684ea1..1ec51653d92ef 100644 --- a/library/core/benches/slice.rs +++ b/library/core/benches/slice.rs @@ -171,3 +171,17 @@ fn fold_to_last(b: &mut Bencher) { let slice: &[i32] = &[0; 1024]; b.iter(|| black_box(slice).iter().fold(None, |_, r| Some(NonNull::from(r)))); } + +#[bench] +fn slice_cmp_generic(b: &mut Bencher) { + #[derive(PartialEq, Clone, Copy)] + struct Foo(u32, u32); + + let left = [Foo(128, 128); 128]; + let right = [Foo(128, 128); 128]; + + b.iter(|| { + let (left, right) = (black_box(&left), black_box(&right)); + left.as_slice() == right.as_slice() + }); +} From 2f78bce940fbcf4d1fb0ca14c650aeab870a1683 Mon Sep 17 00:00:00 2001 From: The 8472 Date: Wed, 4 Oct 2023 16:21:21 +0200 Subject: [PATCH 2/3] unroll slice::equal impl --- library/core/src/slice/cmp.rs | 43 ++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/library/core/src/slice/cmp.rs b/library/core/src/slice/cmp.rs index 075347b80d031..3231e4313a8a3 100644 --- a/library/core/src/slice/cmp.rs +++ b/library/core/src/slice/cmp.rs @@ -55,15 +55,56 @@ impl SlicePartialEq for [A] where A: PartialEq, { + #[inline] default fn equal(&self, other: &[B]) -> bool { if self.len() != other.len() { return false; } - self.iter().zip(other.iter()).all(|(x, y)| x == y) + // at least 8 items for unrolling to make sense (4 peeled + 4+ unrolled) + if self.len() < 8 { + return eq_small(self, other); + } + + eq_unroll(self, other) } } +#[inline] +fn eq_small(a: &[A], b: &[B]) -> bool +where + A: PartialEq, +{ + a.iter().zip(b).all(|(a, b)| a == b) +} + +fn eq_unroll(a: &[A], b: &[B]) -> bool +where + A: PartialEq, +{ + let (mut chunks_a, residual_a) = a.as_chunks::<4>(); + let (mut chunks_b, residual_b) = b.as_chunks::<4>(); + let peeled_a = chunks_a.take_first().unwrap(); + let peeled_b = chunks_b.take_first().unwrap(); + + // peel the first chunk and do a short-circuiting comparison to bail early on mismatches + // in case comparisons are expensive + let mut result = eq_small(peeled_a, peeled_b); + + // then check the residual, another chance to bail early + result = result && eq_small(residual_a, residual_b); + + // iter.all short-circuits which means the backend can't unroll the loop due to early exits. + // So we unroll it manually. + result = result + && chunks_a + .iter() + .zip(chunks_b) + .all(|(a, b)| (a[0] == b[0]) & (a[1] == b[1]) & (a[2] == b[2]) & (a[3] == b[3])); + + result +} + // When each element can be compared byte-wise, we can compare all the bytes // from the whole size in one call to the intrinsics. impl SlicePartialEq for [A] From ef4600dc88eb9dffd45cac9df5bea9007326bd08 Mon Sep 17 00:00:00 2001 From: The 8472 Date: Thu, 5 Oct 2023 14:39:17 +0200 Subject: [PATCH 3/3] handcode the loops so LLVM has to chew less IR --- library/core/src/slice/cmp.rs | 70 ++++++++++++++++------------------- 1 file changed, 31 insertions(+), 39 deletions(-) diff --git a/library/core/src/slice/cmp.rs b/library/core/src/slice/cmp.rs index 3231e4313a8a3..8082b48552c73 100644 --- a/library/core/src/slice/cmp.rs +++ b/library/core/src/slice/cmp.rs @@ -61,48 +61,40 @@ where return false; } - // at least 8 items for unrolling to make sense (4 peeled + 4+ unrolled) - if self.len() < 8 { - return eq_small(self, other); + // ZSTs have no identity and slices don't guarantee which addresses-to-ZSTs they produce + // so we only need to compare them once to determine the behavior of the PartialEq impl + if const { mem::size_of::() == 0 && mem::size_of::() == 0 } { + // zero-length slices are always equal + if self.len() == 0 { + return true; + } + // SAFETY: A and B are ZSTs so it's ok to conjure them out of thin air + return unsafe { mem::zeroed::() == mem::zeroed::() }; } - eq_unroll(self, other) - } -} - -#[inline] -fn eq_small(a: &[A], b: &[B]) -> bool -where - A: PartialEq, -{ - a.iter().zip(b).all(|(a, b)| a == b) -} + const UNROLL: usize = 4; + let mut i = 0; + let mut is_eq = true; + while i + UNROLL < self.len() && is_eq { + // SAFETY: slices are of the same length and loop conditions ensure indexes are in bounds + unsafe { + is_eq = is_eq & (self.get_unchecked(i) == other.get_unchecked(i)); + is_eq = is_eq & (self.get_unchecked(i + 1) == other.get_unchecked(i + 1)); + is_eq = is_eq & (self.get_unchecked(i + 2) == other.get_unchecked(i + 2)); + is_eq = is_eq & (self.get_unchecked(i + 3) == other.get_unchecked(i + 3)); + i = i.unchecked_add(UNROLL); + } + } + while i < self.len() && is_eq { + // SAFETY: slices are of the same length and loop conditions ensure indexes are in bounds + unsafe { + is_eq = is_eq & (self.get_unchecked(i) == other.get_unchecked(i)); + i = i.unchecked_add(1); + } + } -fn eq_unroll(a: &[A], b: &[B]) -> bool -where - A: PartialEq, -{ - let (mut chunks_a, residual_a) = a.as_chunks::<4>(); - let (mut chunks_b, residual_b) = b.as_chunks::<4>(); - let peeled_a = chunks_a.take_first().unwrap(); - let peeled_b = chunks_b.take_first().unwrap(); - - // peel the first chunk and do a short-circuiting comparison to bail early on mismatches - // in case comparisons are expensive - let mut result = eq_small(peeled_a, peeled_b); - - // then check the residual, another chance to bail early - result = result && eq_small(residual_a, residual_b); - - // iter.all short-circuits which means the backend can't unroll the loop due to early exits. - // So we unroll it manually. - result = result - && chunks_a - .iter() - .zip(chunks_b) - .all(|(a, b)| (a[0] == b[0]) & (a[1] == b[1]) & (a[2] == b[2]) & (a[3] == b[3])); - - result + is_eq + } } // When each element can be compared byte-wise, we can compare all the bytes