Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Avx backed Block8x8F Transpose method #1374

Merged
merged 10 commits into from
Oct 12, 2020
6 changes: 3 additions & 3 deletions src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
internal partial struct Block8x8F
{
/// <summary>
/// Transpose the block into the destination block.
/// <summary>
/// Fallback method to transpose a block into the destination block on non AVX supported CPUs.
/// </summary>
/// <param name="d">The destination block</param>
[MethodImpl(InliningOptions.ShortMethod)]
public void TransposeInto(ref Block8x8F d)
public void TransposeIntoFallback(ref Block8x8F d)
{
d.V0L.X = V0L.X;
d.V1L.X = V0L.Y;
Expand Down
6 changes: 3 additions & 3 deletions src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
internal partial struct Block8x8F
{
/// <summary>
/// Transpose the block into the destination block.
/// <summary>
/// Fallback method to transpose a block into the destination block on non AVX supported CPUs.
/// </summary>
/// <param name="d">The destination block</param>
[MethodImpl(InliningOptions.ShortMethod)]
public void TransposeInto(ref Block8x8F d)
public void TransposeIntoFallback(ref Block8x8F d)
{
<#
PushIndent(" ");
Expand Down
97 changes: 97 additions & 0 deletions src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
using System.Text;

// ReSharper disable InconsistentNaming
Expand Down Expand Up @@ -596,5 +600,98 @@ private static void GuardBlockIndex(int idx)
DebugGuard.MustBeLessThan(idx, Size, nameof(idx));
DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx));
}

/// <summary>
/// Transpose the block into the destination block.
/// </summary>
/// <param name="d">The destination block</param>
[MethodImpl(InliningOptions.ShortMethod)]
public void TransposeInto(ref Block8x8F d)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
{
this.TransposeIntoAvx(ref d);
}
else
#endif
{
this.TransposeIntoFallback(ref d);
}
}

#if SUPPORTS_RUNTIME_INTRINSICS
/// <summary>
/// AVX-only variant for executing <see cref="TransposeInto(ref Block8x8F)"/>.
/// <see href="https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536"/>
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public void TransposeIntoAvx(ref Block8x8F d)
{
Vector256<float> r0 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V0L).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
1);
Comment on lines +631 to +634
Copy link
Contributor

@Turnerj Turnerj Oct 8, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't change only slightly changes the assembly that is generated (vmovups vs vmovupd - see comment below) but it might show the intention better as:

Suggested change
Vector256<float> r0 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V0L).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
1);
Vector256<float> r0 = Vector256.Create(
Unsafe.As<Vector4, Vector128<float>>(ref this.V0L),
Unsafe.As<Vector4, Vector128<float>>(ref this.V4L)
);

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can anyone explain the difference in the precision of the output instruction here?

vmovups to vmovupd at L0006

Original

Block8x8F.TransposeIntoAvx(Block8x8F ByRef)
    L0000: sub esp, 0x20
    L0003: vzeroupper
    L0006: vmovups xmm0, [ecx]
    L000a: add ecx, 0x80
    L0010: vmovupd xmm1, [ecx]
    L0014: vinsertf128 ymm0, ymm0, xmm1, 1
    L001a: vmovupd [esp], ymm0
    L001f: vzeroupper
    L0022: add esp, 0x20
    L0025: ret

Suggested Change

Block8x8F.TransposeIntoAvx(Block8x8F ByRef)
    L0000: sub esp, 0x20
    L0003: vzeroupper
    L0006: vmovupd xmm0, [ecx]
    L000a: add ecx, 0x80
    L0010: vmovupd xmm1, [ecx]
    L0014: vinsertf128 ymm0, ymm0, xmm1, 1
    L001a: vmovupd [esp], ymm0
    L001f: vzeroupper
    L0022: add esp, 0x20
    L0025: ret

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could be error but I spotted a slight slow down in the change. Keeping as-is for now.


Vector256<float> r1 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V1L).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
1);

Vector256<float> r2 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V2L).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
1);

Vector256<float> r3 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V3L).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
1);

Vector256<float> r4 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
1);

Vector256<float> r5 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
1);

Vector256<float> r6 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
1);

Vector256<float> r7 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
1);

Vector256<float> t0 = Avx.UnpackLow(r0, r1);
Vector256<float> t2 = Avx.UnpackLow(r2, r3);
Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
Unsafe.As<Vector4, Vector256<float>>(ref d.V0L) = Avx.Blend(t0, v, 0xCC);
Unsafe.As<Vector4, Vector256<float>>(ref d.V1L) = Avx.Blend(t2, v, 0x33);

Vector256<float> t4 = Avx.UnpackLow(r4, r5);
Vector256<float> t6 = Avx.UnpackLow(r6, r7);
v = Avx.Shuffle(t4, t6, 0x4E);
Unsafe.As<Vector4, Vector256<float>>(ref d.V4L) = Avx.Blend(t4, v, 0xCC);
Unsafe.As<Vector4, Vector256<float>>(ref d.V5L) = Avx.Blend(t6, v, 0x33);

Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
v = Avx.Shuffle(t1, t3, 0x4E);
Unsafe.As<Vector4, Vector256<float>>(ref d.V2L) = Avx.Blend(t1, v, 0xCC);
Unsafe.As<Vector4, Vector256<float>>(ref d.V3L) = Avx.Blend(t3, v, 0x33);

Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
v = Avx.Shuffle(t5, t7, 0x4E);
Unsafe.As<Vector4, Vector256<float>>(ref d.V6L) = Avx.Blend(t5, v, 0xCC);
Copy link
Member

@antonfirsov antonfirsov Oct 7, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tannergooding can we rely on codegen producing optimal assembly when working with Unsafe.As? (In comparison to Store / LoadVector256 )

We wanted to avoid pinning, if possible.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would also like to subscribe to this newsletter.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should still fold the loads/stores. I believe there are a couple edge cases where we may generate an additional lea instruction, but the disassembly would nerd to be checked to see if it's an issue.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a new language feature I am unaware of that allows this without pinning?

https://source.dot.net/#System.Private.CoreLib/Matrix4x4.cs,273

Vector128<float> M11 = AdvSimd.LoadVector128(&value1.M11);

Copy link
Member

@antonfirsov antonfirsov Oct 7, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case Matrix4x4 is passed by value to the method, so the compiler knows for sure that value1 is living on the stack, which means it never gets moved around by the GC. => the address can be used safely without pinning anything.

Copy link
Member

@antonfirsov antonfirsov Oct 7, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually: JpegBlockPostprocessor is the holder of all 3 blocks passed to TransformIDCT:

ref Block8x8F b = ref this.SourceBlock;
b.LoadFrom(ref sourceBlock);
// Dequantize:
b.MultiplyInplace(ref this.DequantiazationTable);
FastFloatingPointDCT.TransformIDCT(ref b, ref this.WorkspaceBlock1, ref this.WorkspaceBlock2);

And that struct always lives on the stack:

var blockPp = new JpegBlockPostProcessor(this.ImagePostProcessor.RawJpeg, this.Component);

Therefore, you can get a pointer to the whole Block8x8 buffer at the beginning of the method "safely" with Unsafe:

float* dPtr = (float*)Unsafe.AsPointer(ref d);

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes of course. This all makes sense thanks!

I might give the pointer approach a try tomorrow just to see if there is any difference.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Decided against it. I don't want to risk changes to external code breaking this.

Unsafe.As<Vector4, Vector256<float>>(ref d.V7L) = Avx.Blend(t7, v, 0x33);
}
#endif
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using System.Numerics;
Expand Down Expand Up @@ -50,8 +50,6 @@ internal static class FastFloatingPointDCT
/// <param name="temp">Temporary block provided by the caller</param>
public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
{
// TODO: Transpose is a bottleneck now. We need full AVX support to optimize it:
// https://github.com/dotnet/corefx/issues/22940
src.TransposeInto(ref temp);

IDCT8x4_LeftPart(ref temp, ref dest);
Expand Down Expand Up @@ -340,4 +338,4 @@ public static void TransformFDCT(
dest.MultiplyInplace(C_0_125);
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using BenchmarkDotNet.Attributes;
using SixLabors.ImageSharp.Formats.Jpeg.Components;

namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
{
public class Block8x8F_Transpose
{
private static readonly Block8x8F Source = Create8x8FloatData();

[Benchmark(Baseline=true)]
public void TransposeIntoVector4()
{
var dest = default(Block8x8F);
Source.TransposeIntoFallback(ref dest);
}

#if SUPPORTS_RUNTIME_INTRINSICS
[Benchmark]
public void TransposeIntoAvx()
{
var dest = default(Block8x8F);
Source.TransposeIntoAvx(ref dest);
}
#endif

private static Block8x8F Create8x8FloatData()
{
var result = new float[64];
for (int i = 0; i < 8; i++)
{
for (int j = 0; j < 8; j++)
{
result[(i * 8) + j] = (i * 10) + j;
}
}

var source = default(Block8x8F);
source.LoadFrom(result);
return source;
}
}
}
22 changes: 21 additions & 1 deletion tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,34 @@ public void TransposeInto()
source.LoadFrom(Create8x8FloatData());

var dest = default(Block8x8F);
source.TransposeInto(ref dest);
source.TransposeIntoFallback(ref dest);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's rename the test method as well.


float[] actual = new float[64];
dest.ScaledCopyTo(actual);

Assert.Equal(expected, actual);
}

#if SUPPORTS_RUNTIME_INTRINSICS
[Fact]
public void TransposeIntoAvx()
{
float[] expected = Create8x8FloatData();
ReferenceImplementations.Transpose8x8(expected);

var source = default(Block8x8F);
source.LoadFrom(Create8x8FloatData());

var dest = default(Block8x8F);
source.TransposeIntoAvx(ref dest);

float[] actual = new float[64];
dest.ScaledCopyTo(actual);

Assert.Equal(expected, actual);
}
#endif

private class BufferHolder
{
public Block8x8F Buffer;
Expand Down