From 8192e116f2b88eb65f80a366c63e1abee6415915 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 23 Nov 2021 15:09:48 +0100 Subject: [PATCH 1/8] Add AVX2 version of AddVector --- .../Formats/Webp/Lossless/Vp8LHistogram.cs | 51 +++++++++++++++++-- .../Formats/Webp/Lossy/Vp8Histogram.cs | 2 +- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs index bdb53f5c6a..ac8cc0f655 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs @@ -3,10 +3,16 @@ using System; using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp.Formats.Webp.Lossless { - internal class Vp8LHistogram : IDeepCloneable + internal sealed class Vp8LHistogram : IDeepCloneable { private const uint NonTrivialSym = 0xffffffff; @@ -505,11 +511,48 @@ private static double ExtraCost(Span population, int length) return cost; } - private static void AddVector(uint[] a, uint[] b, uint[] output, int size) + private static void AddVector(Span a, Span b, Span output, int size) { - for (int i = 0; i < size; i++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) { - output[i] = a[i] + b[i]; + ref uint aRef = ref MemoryMarshal.GetReference(a); + ref uint bRef = ref MemoryMarshal.GetReference(b); + ref uint outputRef = ref MemoryMarshal.GetReference(output); + int i; + + for (i = 0; i + 32 <= size; i += 32) + { + // Load values. 
+ Vector256 a0 = Unsafe.As>(ref Unsafe.Add(ref aRef, i)); + Vector256 a1 = Unsafe.As>(ref Unsafe.Add(ref aRef, i + 8)); + Vector256 a2 = Unsafe.As>(ref Unsafe.Add(ref aRef, i + 16)); + Vector256 a3 = Unsafe.As>(ref Unsafe.Add(ref aRef, i + 24)); + Vector256 b0 = Unsafe.As>(ref Unsafe.Add(ref bRef, i)); + Vector256 b1 = Unsafe.As>(ref Unsafe.Add(ref bRef, i + 8)); + Vector256 b2 = Unsafe.As>(ref Unsafe.Add(ref bRef, i + 16)); + Vector256 b3 = Unsafe.As>(ref Unsafe.Add(ref bRef, i + 24)); + + // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But + // that's ok since the histogram values are less than 1<<28 (max picture size). + Unsafe.As>(ref Unsafe.Add(ref outputRef, i)) = Avx2.Add(a0, b0); + Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 8)) = Avx2.Add(a1, b1); + Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 16)) = Avx2.Add(a2, b2); + Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 24)) = Avx2.Add(a3, b3); + } + + for (; i < size; i++) + { + output[i] = a[i] + b[i]; + } + } + else +#endif + { + for (int i = 0; i < size; i++) + { + output[i] = a[i] + b[i]; + } } } } diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs index 6e724e4758..89e7baff39 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs @@ -6,7 +6,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { - internal class Vp8Histogram + internal sealed class Vp8Histogram { private readonly int[] scratch = new int[16]; From a45f49517b7fb4ed7981becb58898ea175273e80 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 23 Nov 2021 15:59:59 +0100 Subject: [PATCH 2/8] Avoid bounds checks in VectorMismatch --- src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 471c083cda..d24431600d 
100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -81,7 +81,9 @@ public static int VectorMismatch(ReadOnlySpan array1, ReadOnlySpan a { int matchLen = 0; - while (matchLen < length && array1[matchLen] == array2[matchLen]) + ref uint array1Ref = ref MemoryMarshal.GetReference(array1); + ref uint array2Ref = ref MemoryMarshal.GetReference(array2); + while (matchLen < length && Unsafe.Add(ref array1Ref, matchLen) == Unsafe.Add(ref array2Ref, matchLen)) { matchLen++; } From 6393484e4283719575451aba7d18d91d7d86b6af Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 23 Nov 2021 17:15:42 +0100 Subject: [PATCH 3/8] Remove duplicate FTransform method --- .../Formats/Webp/Lossy/Vp8Histogram.cs | 44 +------------------ 1 file changed, 1 insertion(+), 43 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs index 89e7baff39..d384302b94 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs @@ -49,7 +49,7 @@ public void CollectHistogram(Span reference, Span pred, int startBlo this.distribution.AsSpan().Clear(); for (j = startBlock; j < endBlock; j++) { - this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output); + Vp8Encoding.FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output, this.scratch); // Convert coefficients to bin. 
for (int k = 0; k < 16; ++k) @@ -98,48 +98,6 @@ private void SetHistogramData(int[] distribution) this.lastNonZero = lastNonZero; } - private void Vp8FTransform(Span src, Span reference, Span output) - { - int i; - Span tmp = this.scratch; - tmp.Clear(); - - for (i = 0; i < 4; i++) - { - int d0 = src[0] - reference[0]; // 9bit dynamic range ([-255,255]) - int d1 = src[1] - reference[1]; - int d2 = src[2] - reference[2]; - int d3 = src[3] - reference[3]; - int a0 = d0 + d3; // 10b [-510,510] - int a1 = d1 + d2; - int a2 = d1 - d2; - int a3 = d0 - d3; - tmp[0 + (i * 4)] = (a0 + a1) * 8; // 14b [-8160,8160] - tmp[1 + (i * 4)] = ((a2 * 2217) + (a3 * 5352) + 1812) >> 9; // [-7536,7542] - tmp[2 + (i * 4)] = (a0 - a1) * 8; - tmp[3 + (i * 4)] = ((a3 * 2217) - (a2 * 5352) + 937) >> 9; - - // Do not change the span in the last iteration. - if (i < 3) - { - src = src.Slice(WebpConstants.Bps); - reference = reference.Slice(WebpConstants.Bps); - } - } - - for (i = 0; i < 4; i++) - { - int a0 = tmp[0 + i] + tmp[12 + i]; // 15b - int a1 = tmp[4 + i] + tmp[8 + i]; - int a2 = tmp[4 + i] - tmp[8 + i]; - int a3 = tmp[0 + i] - tmp[12 + i]; - output[0 + i] = (short)((a0 + a1 + 7) >> 4); // 12b - output[4 + i] = (short)((((a2 * 2217) + (a3 * 5352) + 12000) >> 16) + (a3 != 0 ? 1 : 0)); - output[8 + i] = (short)((a0 - a1 + 7) >> 4); - output[12 + i] = (short)(((a3 * 2217) - (a2 * 5352) + 51000) >> 16); - } - } - [MethodImpl(InliningOptions.ShortMethod)] private static int ClipMax(int v, int max) => v > max ? 
max : v; } From 491b742ae4a70dc6c5ff1e6c3a9db9cb7f00fcf3 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 23 Nov 2021 18:38:19 +0100 Subject: [PATCH 4/8] Add SSE2 version of VectorMismatch --- .../Formats/Webp/Lossless/LosslessUtils.cs | 65 +++++++++++++++++-- 1 file changed, 61 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index d24431600d..319aa8c3d6 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -80,15 +80,72 @@ public static int FindMatchLength(ReadOnlySpan array1, ReadOnlySpan public static int VectorMismatch(ReadOnlySpan array1, ReadOnlySpan array2, int length) { int matchLen = 0; - ref uint array1Ref = ref MemoryMarshal.GetReference(array1); ref uint array2Ref = ref MemoryMarshal.GetReference(array2); - while (matchLen < length && Unsafe.Add(ref array1Ref, matchLen) == Unsafe.Add(ref array2Ref, matchLen)) + +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - matchLen++; + if (length >= 12) + { + Vector128 a0 = Unsafe.As>(ref array1Ref); + Vector128 a1 = Unsafe.As>(ref array2Ref); + + do + { + // Loop unrolling and early load both provide a speedup. + Vector128 cmpA = Sse2.CompareEqual(a0, a1); + Vector128 b0 = Unsafe.As>(ref Unsafe.Add(ref array1Ref, matchLen + 4)); + Vector128 b1 = Unsafe.As>(ref Unsafe.Add(ref array2Ref, matchLen + 4)); + if (Sse2.MoveMask(cmpA.AsByte()) != 0xffff) + { + break; + } + + matchLen += 4; + + Vector128 cmpB = Sse2.CompareEqual(b0, b1); + a0 = Unsafe.As>(ref Unsafe.Add(ref array1Ref, matchLen + 4)); + a1 = Unsafe.As>(ref Unsafe.Add(ref array2Ref, matchLen + 4)); + if (Sse2.MoveMask(cmpB.AsByte()) != 0xffff) + { + break; + } + + matchLen += 4; + } + while (matchLen + 12 < length); + } + else + { + // Unroll the potential first two loops. 
+ if (length >= 4 + && Sse2.MoveMask( + Sse2.CompareEqual( + Unsafe.As>(ref array1Ref), + Unsafe.As>(ref array2Ref)).AsByte()) == 0xffff) + { + matchLen = 4; + if (length >= 8 + && Sse2.MoveMask( + Sse2.CompareEqual( + Unsafe.As>(ref Unsafe.Add(ref array1Ref, 4)), + Unsafe.As>(ref Unsafe.Add(ref array2Ref, 4))).AsByte()) == 0xffff) + { + matchLen = 8; + } + } + } } +#endif + { + while (matchLen < length && Unsafe.Add(ref array1Ref, matchLen) == Unsafe.Add(ref array2Ref, matchLen)) + { + matchLen++; + } - return matchLen; + return matchLen; + } } [MethodImpl(InliningOptions.ShortMethod)] From 427a39213e2e7429667d376a8b86c09b47f6da62 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 23 Nov 2021 23:11:06 +0100 Subject: [PATCH 5/8] Add VectorMismatch tests --- .../Formats/WebP/LosslessUtilsTests.cs | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs index 97567ba218..b6f15e5376 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs @@ -10,6 +10,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp [Trait("Format", "Webp")] public class LosslessUtilsTests { + private static void RunVectorMismatchTest() + { + uint[] array1 = { 4278193152, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896 }; + uint[] array2 = { 4278193152, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896 }; + int expected1 = 18; + + // Test unroll first two loops path also. 
+ uint[] array3 = { 4279238656, 4278714368, 4279238656, 4279238656, 4279238656, 4279238656, 4279238896, 4279238896, 4279238884 }; + uint[] array4 = { 4279238656, 4278714368, 4279238656, 4279238656, 4278190080, 4278190080, 4278190080, 4278190080, 4278190080 }; + int expected2 = 4; + + int actual1 = LosslessUtils.VectorMismatch(array1, array2, 18); + int actual2 = LosslessUtils.VectorMismatch(array3, array4, 9); + + Assert.Equal(expected1, actual1); + Assert.Equal(expected2, actual2); + } + private static void RunSubtractGreenTest() { uint[] pixelData = @@ -193,6 +211,9 @@ private static void RunPredictor13Test() } } + [Fact] + public void VectorMismatch_Works() => RunVectorMismatchTest(); + [Fact] public void Predictor11_Works() => RunPredictor11Test(); @@ -215,6 +236,12 @@ private static void RunPredictor13Test() public void TransformColorInverse_Works() => RunTransformColorInverseTest(); #if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void VectorMismatch_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVectorMismatchTest, HwIntrinsics.AllowAll); + + [Fact] + public void VectorMismatch_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVectorMismatchTest, HwIntrinsics.DisableSSE2); + [Fact] public void Predictor11_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.AllowAll); From fa24760a88e0669a49eaae2e0ef5ef209a86c0d1 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 12:20:28 +0100 Subject: [PATCH 6/8] Add AddVector tests --- .../Formats/WebP/Vp8LHistogramTests.cs | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 tests/ImageSharp.Tests/Formats/WebP/Vp8LHistogramTests.cs diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8LHistogramTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8LHistogramTests.cs new file mode 100644 index 0000000000..f39e16bc24 --- /dev/null +++ 
b/tests/ImageSharp.Tests/Formats/WebP/Vp8LHistogramTests.cs @@ -0,0 +1,109 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Linq; +using SixLabors.ImageSharp.Formats.Webp.Lossless; +using SixLabors.ImageSharp.Tests.TestUtilities; +using Xunit; + +namespace SixLabors.ImageSharp.Tests.Formats.WebP +{ + public class Vp8LHistogramTests + { + private static void RunAddVectorTest() + { + // arrange + uint[] pixelData = + { + 4278191104, 4278191104, 4278191104, 4278191104, 4278191104, 4278191104, 4278191104, 4294577152, + 4294707200, 4294707200, 4294707200, 4294707200, 4294837248, 4294837248, 4293926912, 4294316544, + 4278191104, 4278191104, 4294837248, 4294837248, 4280287232, 4280350720, 4294447104, 4294707200, + 4294838272, 4278516736, 4294837248, 4294837248, 4278516736, 4294707200, 4279298048, 4294837248, + 4294837248, 4294837248, 4294837248, 4280287232, 4280287232, 4292670464, 4279633408, 4294838272, + 4294837248, 4278516736, 4278516736, 4278516736, 4278516736, 4278516736, 4278778880, 4278193152, + 4278191104, 4280287232, 4280287232, 4280287232, 4280287232, 4293971968, 4280612864, 4292802560, + 4294837760, 4278516736, 4278516736, 4294837760, 4294707712, 4278516736, 4294837248, 4278193152, + 4280287232, 4278984704, 4280287232, 4278243328, 4280287232, 4278244352, 4280287232, 4280025088, + 4280025088, 4294837760, 4278192128, 4294838784, 4294837760, 4294707712, 4278778880, 4278324224, + 4280287232, 4280287232, 4278202368, 4279115776, 4280287232, 4278243328, 4280287232, 4280287232, + 4280025088, 4280287232, 4278192128, 4294838272, 4294838272, 4294837760, 4278190592, 4278778880, + 4280875008, 4280287232, 4279896576, 4281075712, 4281075712, 4280287232, 4280287232, 4280287232, + 4280287232, 4280287232, 4278190592, 4294709248, 4278516736, 4278516736, 4278584832, 4278909440, + 4280287232, 4280287232, 4294367744, 4294621184, 4279115776, 4280287232, 4280287232, 4280351744, + 4280287232, 4280287232, 4280287232, 
4278513664, 4278516736, 4278716416, 4278584832, 4280291328, + 4293062144, 4280287232, 4280287232, 4280287232, 4294456320, 4280291328, 4280287232, 4280287232, + 4280287232, 4280287232, 4280287232, 4280287232, 4278513152, 4278716416, 4278584832, 4280291328, + 4278198272, 4278198272, 4278589952, 4278198272, 4278198272, 4280287232, 4278765568, 4280287232, + 4280287232, 4280287232, 4280287232, 4294712832, 4278513152, 4278716640, 4279300608, 4278584832, + 4280156672, 4279373312, 4278589952, 4279373312, 4278328832, 4278328832, 4278328832, 4279634432, + 4280287232, 4280287232, 4280287232, 4280287232, 4278457344, 4280483328, 4278584832, 4278385664, + 4279634432, 4279373312, 4279634432, 4280287232, 4280287232, 4280156672, 4278589952, 4278328832, + 4278198272, 4280156672, 4280483328, 4294363648, 4280287232, 4278376448, 4280287232, 4278647808, + 4280287232, 4280287232, 4279373312, 4280287232, 4280287232, 4280156672, 4280287232, 4278198272, + 4278198272, 4280156672, 4280287232, 4280287232, 4293669888, 4278765568, 4278765568, 4280287232, + 4280287232, 4280287232, 4279634432, 4279634432, 4280287232, 4280287232, 4280287232, 4280287232, + 4280287232, 4280287232, 4280287232, 4280287232, 4279373312, 4279764992, 4293539328, 4279896576, + 4280287232, 4280287232, 4280287232, 4279634432, 4278198272, 4279634432, 4280287232, 4280287232, + 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4279503872, 4279503872, 4280288256, + 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, + 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232, 4280287232 + }; + + uint[] literals = + { + 198, 0, 14, 0, 46, 0, 22, 0, 36, 0, 24, 0, 12, 0, 10, 0, 10, 0, 2, 0, 2, 0, 0, 0, 0, 0, 6, 0, 0, 0, + 10, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 0, 0, 0, 6, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 6, 0, 2, 0, 2, 0, 2, 0, 0, 0, 8, 0, 2, 0, 38, 0, 4 + }; + + uint[] expectedLiterals = new uint[1305]; + + // All remaining values are expected to be zero. + literals.AsSpan().CopyTo(expectedLiterals); + + var backwardRefs = new Vp8LBackwardRefs(pixelData.Length); + for (int i = 0; i < pixelData.Length; i++) + { + backwardRefs.Add(new PixOrCopy() + { + BgraOrDistance = pixelData[i], + Len = 1, + Mode = PixOrCopyMode.Literal + }); + } + + var histogram0 = new Vp8LHistogram(backwardRefs, 3); + var histogram1 = new Vp8LHistogram(backwardRefs, 3); + for (int i = 0; i < 5; i++) + { + histogram0.IsUsed[i] = true; + histogram1.IsUsed[i] = true; + } + + var output = new Vp8LHistogram(3); + + // act + histogram0.Add(histogram1, output); + + // assert + Assert.True(output.Literal.SequenceEqual(expectedLiterals)); + } + + [Fact] + public void AddVector_Works() => RunAddVectorTest(); + +#if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void AddVector_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddVectorTest, HwIntrinsics.AllowAll); + + [Fact] + public void AddVector_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunAddVectorTest, HwIntrinsics.DisableAVX2); +#endif + } +} From c174ab42bea366257a410e8335055b9d27d487ff Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 24 Nov 2021 20:13:46 +0100 Subject: [PATCH 7/8] Remove SSE2 version of VectorMismatch: Profiling does not show any speedup --- .../Formats/Webp/Lossless/LosslessUtils.cs | 63 +------------------ .../Formats/WebP/LosslessUtilsTests.cs | 28 +-------- 2 files 
changed, 4 insertions(+), 87 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 319aa8c3d6..0ed180a184 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -83,69 +83,12 @@ public static int VectorMismatch(ReadOnlySpan array1, ReadOnlySpan a ref uint array1Ref = ref MemoryMarshal.GetReference(array1); ref uint array2Ref = ref MemoryMarshal.GetReference(array2); -#if SUPPORTS_RUNTIME_INTRINSICS - if (Sse2.IsSupported) + while (matchLen < length && Unsafe.Add(ref array1Ref, matchLen) == Unsafe.Add(ref array2Ref, matchLen)) { - if (length >= 12) - { - Vector128 a0 = Unsafe.As>(ref array1Ref); - Vector128 a1 = Unsafe.As>(ref array2Ref); - - do - { - // Loop unrolling and early load both provide a speedup. - Vector128 cmpA = Sse2.CompareEqual(a0, a1); - Vector128 b0 = Unsafe.As>(ref Unsafe.Add(ref array1Ref, matchLen + 4)); - Vector128 b1 = Unsafe.As>(ref Unsafe.Add(ref array2Ref, matchLen + 4)); - if (Sse2.MoveMask(cmpA.AsByte()) != 0xffff) - { - break; - } - - matchLen += 4; - - Vector128 cmpB = Sse2.CompareEqual(b0, b1); - a0 = Unsafe.As>(ref Unsafe.Add(ref array1Ref, matchLen + 4)); - a1 = Unsafe.As>(ref Unsafe.Add(ref array2Ref, matchLen + 4)); - if (Sse2.MoveMask(cmpB.AsByte()) != 0xffff) - { - break; - } - - matchLen += 4; - } - while (matchLen + 12 < length); - } - else - { - // Unroll the potential first two loops. 
- if (length >= 4 - && Sse2.MoveMask( - Sse2.CompareEqual( - Unsafe.As>(ref array1Ref), - Unsafe.As>(ref array2Ref)).AsByte()) == 0xffff) - { - matchLen = 4; - if (length >= 8 - && Sse2.MoveMask( - Sse2.CompareEqual( - Unsafe.As>(ref Unsafe.Add(ref array1Ref, 4)), - Unsafe.As>(ref Unsafe.Add(ref array2Ref, 4))).AsByte()) == 0xffff) - { - matchLen = 8; - } - } - } + matchLen++; } -#endif - { - while (matchLen < length && Unsafe.Add(ref array1Ref, matchLen) == Unsafe.Add(ref array2Ref, matchLen)) - { - matchLen++; - } - return matchLen; - } + return matchLen; } [MethodImpl(InliningOptions.ShortMethod)] diff --git a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs index b6f15e5376..62e23c1cdf 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs @@ -10,24 +10,6 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp [Trait("Format", "Webp")] public class LosslessUtilsTests { - private static void RunVectorMismatchTest() - { - uint[] array1 = { 4278193152, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896 }; - uint[] array2 = { 4278193152, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896, 4278192896 }; - int expected1 = 18; - - // Test unroll first two loops path also. 
- uint[] array3 = { 4279238656, 4278714368, 4279238656, 4279238656, 4279238656, 4279238656, 4279238896, 4279238896, 4279238884 }; - uint[] array4 = { 4279238656, 4278714368, 4279238656, 4279238656, 4278190080, 4278190080, 4278190080, 4278190080, 4278190080 }; - int expected2 = 4; - - int actual1 = LosslessUtils.VectorMismatch(array1, array2, 18); - int actual2 = LosslessUtils.VectorMismatch(array3, array4, 9); - - Assert.Equal(expected1, actual1); - Assert.Equal(expected2, actual2); - } - private static void RunSubtractGreenTest() { uint[] pixelData = @@ -211,9 +193,6 @@ private static void RunPredictor13Test() } } - [Fact] - public void VectorMismatch_Works() => RunVectorMismatchTest(); - [Fact] public void Predictor11_Works() => RunPredictor11Test(); @@ -236,12 +215,7 @@ private static void RunPredictor13Test() public void TransformColorInverse_Works() => RunTransformColorInverseTest(); #if SUPPORTS_RUNTIME_INTRINSICS - [Fact] - public void VectorMismatch_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVectorMismatchTest, HwIntrinsics.AllowAll); - - [Fact] - public void VectorMismatch_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVectorMismatchTest, HwIntrinsics.DisableSSE2); - + [Fact] public void Predictor11_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.AllowAll); From 2d60b73b140b0ab8d6a85a941b0319c86be32d14 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 25 Nov 2021 13:28:35 +0100 Subject: [PATCH 8/8] Rename size to count, add DebugGuard --- .../Formats/Webp/Lossless/Vp8LHistogram.cs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs index ac8cc0f655..bfb8f40d4a 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/Vp8LHistogram.cs @@ 
-511,8 +511,12 @@ private static double ExtraCost(Span population, int length) return cost; } - private static void AddVector(Span a, Span b, Span output, int size) + private static void AddVector(Span a, Span b, Span output, int count) { + DebugGuard.MustBeGreaterThanOrEqualTo(a.Length, count, nameof(a.Length)); + DebugGuard.MustBeGreaterThanOrEqualTo(b.Length, count, nameof(b.Length)); + DebugGuard.MustBeGreaterThanOrEqualTo(output.Length, count, nameof(output.Length)); + #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported) { @@ -521,7 +525,7 @@ private static void AddVector(Span a, Span b, Span output, int ref uint outputRef = ref MemoryMarshal.GetReference(output); int i; - for (i = 0; i + 32 <= size; i += 32) + for (i = 0; i + 32 <= count; i += 32) { // Load values. Vector256 a0 = Unsafe.As>(ref Unsafe.Add(ref aRef, i)); @@ -534,14 +538,14 @@ private static void AddVector(Span a, Span b, Span output, int Vector256 b3 = Unsafe.As>(ref Unsafe.Add(ref bRef, i + 24)); // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But - // that's ok since the histogram values are less than 1<<28 (max picture size). + // that's ok since the histogram values are less than 1<<28 (max picture size). Unsafe.As>(ref Unsafe.Add(ref outputRef, i)) = Avx2.Add(a0, b0); Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 8)) = Avx2.Add(a1, b1); Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 16)) = Avx2.Add(a2, b2); Unsafe.As>(ref Unsafe.Add(ref outputRef, i + 24)) = Avx2.Add(a3, b3); } - for (; i < size; i++) + for (; i < count; i++) { output[i] = a[i] + b[i]; } @@ -549,7 +553,7 @@ private static void AddVector(Span a, Span b, Span output, int else #endif { - for (int i = 0; i < size; i++) + for (int i = 0; i < count; i++) { output[i] = a[i] + b[i]; }