Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ARM version of adler32 #2015

Merged
merged 33 commits into from
Mar 2, 2022
Merged
Changes from 4 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
8db4828
Handle empty XMP profiles. Fix #2012
JimBobSquarePants Feb 18, 2022
5214681
Add ARM version of adler32
brianpopow Feb 18, 2022
ce429e9
Remove serial computation, does not make sense here
brianpopow Feb 18, 2022
c7b62d8
Fix build error
brianpopow Feb 18, 2022
36420de
Block size is the same for sse and arm
brianpopow Feb 18, 2022
c64078c
Remove unnecessary throw and optimize write.
JimBobSquarePants Feb 19, 2022
c129278
Don't throw for bad min code, just don't decode indices.
JimBobSquarePants Feb 19, 2022
892dbe3
Throw exception, if palette chunk is missing
brianpopow Feb 19, 2022
d98171f
Add tests for missing palette chunk
brianpopow Feb 19, 2022
bcb3035
Merge branch 'main' into bp/missingpalette
brianpopow Feb 19, 2022
d7fec18
Apply suggestions from code review
brianpopow Feb 19, 2022
cfc7847
Use MinBufferSize for ARM
brianpopow Feb 18, 2022
29ddc60
Change error message to "...a palette chunk"
brianpopow Feb 19, 2022
bef5162
Add missing using System.Runtime.InteropServices;
brianpopow Feb 19, 2022
d62391e
Fix bug in storing the results
brianpopow Feb 20, 2022
c3c14e8
Add adler tests with and without intrinsics
brianpopow Feb 20, 2022
9179c11
Add common methods for handling left over for sse and arm
brianpopow Feb 20, 2022
78e5b6c
Throw for corrupt LZW min code. Add test for deferred clear code
JimBobSquarePants Feb 21, 2022
968c5ff
Merge pull request #2020 from SixLabors/bp/missingpalette
JimBobSquarePants Feb 21, 2022
eda0906
Merge branch 'main' into js/fix-2012
JimBobSquarePants Feb 21, 2022
18f7c09
Merge branch 'main' into bp/adlerarm
brianpopow Feb 21, 2022
85a0ac6
Use GifThrowHelper
JimBobSquarePants Feb 21, 2022
6727d6e
Merge pull request #2014 from SixLabors/js/fix-2012
JimBobSquarePants Feb 21, 2022
97d1a4e
Merge remote-tracking branch 'origin/main' into bp/invalidgamma
brianpopow Feb 21, 2022
0fa6085
Throw exception, if gamma chunk does not contain enough data
brianpopow Feb 21, 2022
d76c40a
Ignore invalid gamma chunks
brianpopow Feb 21, 2022
c059656
Merge pull request #2021 from SixLabors/bp/invalidgamma
brianpopow Feb 21, 2022
854ea5d
workaround for #2001 / https://github.com/dotnet/runtime/issues/65466
antonfirsov Feb 21, 2022
5b82c57
Add compiler directives
JimBobSquarePants Feb 22, 2022
de5f661
make GetTotalAvailableMemoryBytes conditional
antonfirsov Feb 22, 2022
7387607
Merge branch 'af/TotalAvailableMemoryBytes-workaround' of https://git…
antonfirsov Feb 22, 2022
3259c94
Merge pull request #2025 from SixLabors/af/TotalAvailableMemoryBytes-…
antonfirsov Feb 22, 2022
bfd7b54
Merge branch 'main' into bp/adlerarm
brianpopow Feb 26, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 163 additions & 24 deletions src/ImageSharp/Compression/Zlib/Adler32.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#if NET5_0_OR_GREATER
using System.Runtime.Intrinsics.Arm;
#endif
#endif

#pragma warning disable IDE0007 // Use implicit type
Expand All @@ -23,14 +26,17 @@ internal static class Adler32
public const uint SeedValue = 1U;

// Largest prime smaller than 65536
private const uint BASE = 65521;
private const uint Base = 65521;

// NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
private const uint NMAX = 5552;
private const uint Nmax = 5552;

#if SUPPORTS_RUNTIME_INTRINSICS
private const int MinBufferSize = 64;

// Data will be processed in blocks of 32 bytes.
private const int BlockSize = 1 << 5;

// The C# compiler emits this as a compile-time constant embedded in the PE file.
private static ReadOnlySpan<byte> Tap1Tap2 => new byte[]
{
Expand Down Expand Up @@ -67,11 +73,14 @@ public static uint Calculate(uint adler, ReadOnlySpan<byte> buffer)
{
return CalculateSse(adler, buffer);
}

return CalculateScalar(adler, buffer);
#else
return CalculateScalar(adler, buffer);
#if NET5_0_OR_GREATER
if (AdvSimd.IsSupported)
{
return CalculateArm(adler, buffer);
}
#endif
#endif
return CalculateScalar(adler, buffer);
}

// Based on https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c
Expand All @@ -83,19 +92,17 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
uint s2 = (adler >> 16) & 0xFFFF;

// Process the data in blocks.
const int BLOCK_SIZE = 1 << 5;

uint length = (uint)buffer.Length;
uint blocks = length / BLOCK_SIZE;
length -= blocks * BLOCK_SIZE;
uint blocks = length / BlockSize;
length -= blocks * BlockSize;

int index = 0;
fixed (byte* bufferPtr = buffer)
{
fixed (byte* tapPtr = Tap1Tap2)
{
index += (int)blocks * BLOCK_SIZE;
var localBufferPtr = bufferPtr;
index += (int)blocks * BlockSize;
byte* localBufferPtr = bufferPtr;

// _mm_setr_epi8 on x86
Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
Expand All @@ -105,7 +112,7 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)

while (blocks > 0)
{
uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
uint n = Nmax / BlockSize; /* The NMAX constraint. */
if (n > blocks)
{
n = blocks;
Expand Down Expand Up @@ -138,7 +145,7 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

localBufferPtr += BLOCK_SIZE;
localBufferPtr += BlockSize;
}
while (--n > 0);

Expand All @@ -158,8 +165,8 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
s2 = v_s2.ToScalar();

// Reduce.
s1 %= BASE;
s2 %= BASE;
s1 %= Base;
s2 %= Base;
}

if (length > 0)
Expand Down Expand Up @@ -192,35 +199,167 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
s2 += s1 += *localBufferPtr++;
}

if (s1 >= BASE)
if (s1 >= Base)
{
s1 -= BASE;
s1 -= Base;
}

s2 %= BASE;
s2 %= Base;
}

return s1 | (s2 << 16);
}
}
}

#if NET5_0_OR_GREATER

// Computes the Adler-32 checksum of <paramref name="buffer"/> using ARM AdvSimd (NEON)
// intrinsics, folding the result into the running checksum <paramref name="crc"/>.
// Based on: https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
private static unsafe uint CalculateArm(uint crc, ReadOnlySpan<byte> buffer)
{
    // Split Adler-32 into its two component sums: s1 (low 16 bits) and s2 (high 16 bits).
    uint s1 = crc & 0xFFFF;
    uint s2 = (crc >> 16) & 0xFFFF;
    int len = buffer.Length;
    int bufferOffset = 0;

    // Process the data in blocks of BlockSize (32) bytes.
    long blocks = len / BlockSize;
    len -= (int)(blocks * BlockSize);
    fixed (byte* bufferPtr = buffer)
    {
        while (blocks != 0)
        {
            // At most Nmax data bytes can be processed before s2 must be
            // reduced modulo Base to avoid 32-bit overflow.
            uint n = Nmax / BlockSize;
            if (n > blocks)
            {
                n = (uint)blocks;
            }

            blocks -= n;

            // Seed lane 3 of vs2 with s1 * n: each of the n blocks contributes the
            // incoming s1 to s2 thirty-two times, and the whole vector is shifted
            // left by 5 (multiplied by 32) after the loop below.
            Vector128<uint> vs2 = Vector128.Create(0u, 0, 0, s1 * n);
            Vector128<uint> vs1 = Vector128<uint>.Zero;

            // Per-column byte sums, later weighted by byte position for s2.
            Vector128<ushort> vColumnSum1 = Vector128<ushort>.Zero;
            Vector128<ushort> vColumnSum2 = Vector128<ushort>.Zero;
            Vector128<ushort> vColumnSum3 = Vector128<ushort>.Zero;
            Vector128<ushort> vColumnSum4 = Vector128<ushort>.Zero;

            do
            {
                // Load 32 input bytes.
                Vector128<ushort> bytes1 = AdvSimd.LoadVector128(bufferPtr + bufferOffset).AsUInt16();
                Vector128<ushort> bytes2 = AdvSimd.LoadVector128(bufferPtr + bufferOffset + 16).AsUInt16();

                // Add previous block byte sum to vs2.
                vs2 = AdvSimd.Add(vs2, vs1);

                // Horizontally add the bytes for s1.
                vs1 = AdvSimd.AddPairwiseWideningAndAdd(
                    vs1.AsUInt32(),
                    AdvSimd.AddPairwiseWideningAndAdd(AdvSimd.AddPairwiseWidening(bytes1.AsByte()).AsUInt16(), bytes2.AsByte()));

                // Vertically add the bytes for s2.
                vColumnSum1 = AdvSimd.AddWideningLower(vColumnSum1, bytes1.GetLower().AsByte());
                vColumnSum2 = AdvSimd.AddWideningLower(vColumnSum2, bytes1.GetUpper().AsByte());
                vColumnSum3 = AdvSimd.AddWideningLower(vColumnSum3, bytes2.GetLower().AsByte());
                vColumnSum4 = AdvSimd.AddWideningLower(vColumnSum4, bytes2.GetUpper().AsByte());

                bufferOffset += BlockSize;
            } while (--n > 0);

            // vs2 currently holds block-prefix sums; each counts 32 bytes, so scale by 32.
            vs2 = AdvSimd.ShiftLeftLogical(vs2, 5);

            // Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
            vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum1.GetLower(), Vector64.Create((ushort)32, 31, 30, 29));
            vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum1.GetUpper(), Vector64.Create((ushort)28, 27, 26, 25));
            vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum2.GetLower(), Vector64.Create((ushort)24, 23, 22, 21));
            vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum2.GetUpper(), Vector64.Create((ushort)20, 19, 18, 17));
            vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum3.GetLower(), Vector64.Create((ushort)16, 15, 14, 13));
            vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum3.GetUpper(), Vector64.Create((ushort)12, 11, 10, 9));
            vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum4.GetLower(), Vector64.Create((ushort)8, 7, 6, 5));
            vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum4.GetUpper(), Vector64.Create((ushort)4, 3, 2, 1));

            // Sum epi32 ints vs1(s2) and accumulate in s1(s2).
            Vector64<uint> sum1 = AdvSimd.AddPairwise(vs1.GetLower(), vs1.GetUpper());
            Vector64<uint> sum2 = AdvSimd.AddPairwise(vs2.GetLower(), vs2.GetUpper());
            Vector64<uint> s1s2 = AdvSimd.AddPairwise(sum1, sum2);

            // BUGFIX: accumulate into the running sums instead of overwriting them.
            // Using plain assignment here discarded the incoming seed value and the
            // totals from previous outer-loop iterations (the reference zlib NEON
            // implementation also accumulates with '+=').
            s1 += AdvSimd.Extract(s1s2, 0);
            s2 += AdvSimd.Extract(s1s2, 1);

            // Reduce.
            s1 %= Base;
            s2 %= Base;
        }
    }

    // Handle leftover (fewer than BlockSize) bytes scalar-wise.
    if (len != 0)
    {
        if (len >= 16)
        {
            s2 += s1 += buffer[bufferOffset++];
            s2 += s1 += buffer[bufferOffset++];
            s2 += s1 += buffer[bufferOffset++];
            s2 += s1 += buffer[bufferOffset++];

            s2 += s1 += buffer[bufferOffset++];
            s2 += s1 += buffer[bufferOffset++];
            s2 += s1 += buffer[bufferOffset++];
            s2 += s1 += buffer[bufferOffset++];

            s2 += s1 += buffer[bufferOffset++];
            s2 += s1 += buffer[bufferOffset++];
            s2 += s1 += buffer[bufferOffset++];
            s2 += s1 += buffer[bufferOffset++];

            s2 += s1 += buffer[bufferOffset++];
            s2 += s1 += buffer[bufferOffset++];
            s2 += s1 += buffer[bufferOffset++];
            s2 += s1 += buffer[bufferOffset++];

            len -= 16;
        }

        while (len-- > 0)
        {
            s2 += s1 += buffer[bufferOffset++];
        }

        // s1 can exceed Base by at most 255 * 31 here, so a single subtraction suffices.
        if (s1 >= Base)
        {
            s1 -= Base;
        }

        s2 %= Base;
    }

    // Return the recombined sums.
    return s1 | (s2 << 16);
}

#endif
#endif

[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer)
{
uint s1 = adler & 0xFFFF;
uint s2 = (adler >> 16) & 0xFFFF;
uint k;

fixed (byte* bufferPtr = buffer)
brianpopow marked this conversation as resolved.
Show resolved Hide resolved
{
var localBufferPtr = bufferPtr;
byte* localBufferPtr = bufferPtr;
uint length = (uint)buffer.Length;

while (length > 0)
{
k = length < NMAX ? length : NMAX;
var k = length < Nmax ? length : Nmax;
length -= k;

while (k >= 16)
Expand Down Expand Up @@ -251,8 +390,8 @@ private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer
s2 += s1 += *localBufferPtr++;
}

s1 %= BASE;
s2 %= BASE;
s1 %= Base;
s2 %= Base;
}

return (s2 << 16) | s1;
Expand Down