-
-
Notifications
You must be signed in to change notification settings - Fork 855
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Avx backed Block8x8F Transpose method #1374
Changes from 4 commits
24d49e5
7a55662
3b2ade5
093fbc4
8e5a59f
6dae52e
9c648d7
97c1846
e33c1cd
685693a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -6,6 +6,10 @@ | |||||||||||||||||
using System.Numerics; | ||||||||||||||||||
using System.Runtime.CompilerServices; | ||||||||||||||||||
using System.Runtime.InteropServices; | ||||||||||||||||||
#if SUPPORTS_RUNTIME_INTRINSICS | ||||||||||||||||||
using System.Runtime.Intrinsics; | ||||||||||||||||||
using System.Runtime.Intrinsics.X86; | ||||||||||||||||||
#endif | ||||||||||||||||||
using System.Text; | ||||||||||||||||||
|
||||||||||||||||||
// ReSharper disable InconsistentNaming | ||||||||||||||||||
|
@@ -596,5 +600,98 @@ private static void GuardBlockIndex(int idx) | |||||||||||||||||
DebugGuard.MustBeLessThan(idx, Size, nameof(idx)); | ||||||||||||||||||
DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx)); | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
/// <summary> | ||||||||||||||||||
/// Transpose the block into the destination block. | ||||||||||||||||||
/// </summary> | ||||||||||||||||||
/// <param name="d">The destination block</param> | ||||||||||||||||||
[MethodImpl(InliningOptions.ShortMethod)] | ||||||||||||||||||
public void TransposeInto(ref Block8x8F d) | ||||||||||||||||||
{ | ||||||||||||||||||
#if SUPPORTS_RUNTIME_INTRINSICS | ||||||||||||||||||
if (Avx.IsSupported) | ||||||||||||||||||
{ | ||||||||||||||||||
this.TransposeIntoAvx(ref d); | ||||||||||||||||||
} | ||||||||||||||||||
else | ||||||||||||||||||
#endif | ||||||||||||||||||
{ | ||||||||||||||||||
this.TransposeIntoFallback(ref d); | ||||||||||||||||||
} | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
#if SUPPORTS_RUNTIME_INTRINSICS | ||||||||||||||||||
/// <summary> | ||||||||||||||||||
/// AVX-only variant for executing <see cref="TransposeInto(ref Block8x8F)"/>. | ||||||||||||||||||
/// <see href="https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536"/> | ||||||||||||||||||
/// </summary> | ||||||||||||||||||
[MethodImpl(InliningOptions.ShortMethod)] | ||||||||||||||||||
public void TransposeIntoAvx(ref Block8x8F d) | ||||||||||||||||||
{ | ||||||||||||||||||
Vector256<float> r0 = Avx.InsertVector128( | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V0L).ToVector256(), | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V4L), | ||||||||||||||||||
1); | ||||||||||||||||||
|
||||||||||||||||||
Vector256<float> r1 = Avx.InsertVector128( | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V1L).ToVector256(), | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V5L), | ||||||||||||||||||
1); | ||||||||||||||||||
|
||||||||||||||||||
Vector256<float> r2 = Avx.InsertVector128( | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V2L).ToVector256(), | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V6L), | ||||||||||||||||||
1); | ||||||||||||||||||
|
||||||||||||||||||
Vector256<float> r3 = Avx.InsertVector128( | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V3L).ToVector256(), | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V7L), | ||||||||||||||||||
1); | ||||||||||||||||||
|
||||||||||||||||||
Vector256<float> r4 = Avx.InsertVector128( | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(), | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V4R), | ||||||||||||||||||
1); | ||||||||||||||||||
|
||||||||||||||||||
Vector256<float> r5 = Avx.InsertVector128( | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(), | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V5R), | ||||||||||||||||||
1); | ||||||||||||||||||
|
||||||||||||||||||
Vector256<float> r6 = Avx.InsertVector128( | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(), | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V6R), | ||||||||||||||||||
1); | ||||||||||||||||||
|
||||||||||||||||||
Vector256<float> r7 = Avx.InsertVector128( | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(), | ||||||||||||||||||
Unsafe.As<Vector4, Vector128<float>>(ref this.V7R), | ||||||||||||||||||
1); | ||||||||||||||||||
|
||||||||||||||||||
Vector256<float> t0 = Avx.UnpackLow(r0, r1); | ||||||||||||||||||
Vector256<float> t2 = Avx.UnpackLow(r2, r3); | ||||||||||||||||||
Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E); | ||||||||||||||||||
Unsafe.As<Vector4, Vector256<float>>(ref d.V0L) = Avx.Blend(t0, v, 0xCC); | ||||||||||||||||||
Unsafe.As<Vector4, Vector256<float>>(ref d.V1L) = Avx.Blend(t2, v, 0x33); | ||||||||||||||||||
|
||||||||||||||||||
Vector256<float> t4 = Avx.UnpackLow(r4, r5); | ||||||||||||||||||
Vector256<float> t6 = Avx.UnpackLow(r6, r7); | ||||||||||||||||||
v = Avx.Shuffle(t4, t6, 0x4E); | ||||||||||||||||||
Unsafe.As<Vector4, Vector256<float>>(ref d.V4L) = Avx.Blend(t4, v, 0xCC); | ||||||||||||||||||
Unsafe.As<Vector4, Vector256<float>>(ref d.V5L) = Avx.Blend(t6, v, 0x33); | ||||||||||||||||||
|
||||||||||||||||||
Vector256<float> t1 = Avx.UnpackHigh(r0, r1); | ||||||||||||||||||
Vector256<float> t3 = Avx.UnpackHigh(r2, r3); | ||||||||||||||||||
v = Avx.Shuffle(t1, t3, 0x4E); | ||||||||||||||||||
Unsafe.As<Vector4, Vector256<float>>(ref d.V2L) = Avx.Blend(t1, v, 0xCC); | ||||||||||||||||||
Unsafe.As<Vector4, Vector256<float>>(ref d.V3L) = Avx.Blend(t3, v, 0x33); | ||||||||||||||||||
|
||||||||||||||||||
Vector256<float> t5 = Avx.UnpackHigh(r4, r5); | ||||||||||||||||||
Vector256<float> t7 = Avx.UnpackHigh(r6, r7); | ||||||||||||||||||
v = Avx.Shuffle(t5, t7, 0x4E); | ||||||||||||||||||
Unsafe.As<Vector4, Vector256<float>>(ref d.V6L) = Avx.Blend(t5, v, 0xCC); | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @tannergooding can we rely on codegen producing optimal assembly when working with We wanted to avoid pinning, if possible. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would also like to subscribe to this newsletter. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should still fold the loads/stores. I believe there are a couple edge cases where we may generate an additional There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a new language feature I am unaware of that allows this without pinning? https://source.dot.net/#System.Private.CoreLib/Matrix4x4.cs,273 Vector128<float> M11 = AdvSimd.LoadVector128(&value1.M11); There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually: ImageSharp/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs Lines 80 to 86 in 78a584e
And that struct always lives on the stack: ImageSharp/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegComponentPostProcessor.cs Line 79 in 78a584e
Therefore, you can get a pointer to the whole float* dPtr = (float*)Unsafe.AsPointer(ref d); There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah yes of course. This all makes sense thanks! I might give the pointer approach a try tomorrow just to see if there is any difference. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Decided against it. I don't want to risk changes to external code breaking this. |
||||||||||||||||||
Unsafe.As<Vector4, Vector256<float>>(ref d.V7L) = Avx.Blend(t7, v, 0x33); | ||||||||||||||||||
} | ||||||||||||||||||
#endif | ||||||||||||||||||
} | ||||||||||||||||||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
// Copyright (c) Six Labors. | ||
// Licensed under the Apache License, Version 2.0. | ||
|
||
using BenchmarkDotNet.Attributes; | ||
using SixLabors.ImageSharp.Formats.Jpeg.Components; | ||
|
||
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations | ||
{ | ||
public class Block8x8F_Transpose | ||
{ | ||
private static readonly Block8x8F Source = Create8x8FloatData(); | ||
|
||
[Benchmark(Baseline=true)] | ||
public void TransposeIntoVector4() | ||
{ | ||
var dest = default(Block8x8F); | ||
Source.TransposeIntoFallback(ref dest); | ||
} | ||
|
||
#if SUPPORTS_RUNTIME_INTRINSICS | ||
[Benchmark] | ||
public void TransposeIntoAvx() | ||
{ | ||
var dest = default(Block8x8F); | ||
Source.TransposeIntoAvx(ref dest); | ||
} | ||
#endif | ||
|
||
private static Block8x8F Create8x8FloatData() | ||
{ | ||
var result = new float[64]; | ||
for (int i = 0; i < 8; i++) | ||
{ | ||
for (int j = 0; j < 8; j++) | ||
{ | ||
result[(i * 8) + j] = (i * 10) + j; | ||
} | ||
} | ||
|
||
var source = default(Block8x8F); | ||
source.LoadFrom(result); | ||
return source; | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -172,14 +172,34 @@ public void TransposeInto() | |
source.LoadFrom(Create8x8FloatData()); | ||
|
||
var dest = default(Block8x8F); | ||
source.TransposeInto(ref dest); | ||
source.TransposeIntoFallback(ref dest); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's rename the test method as well. |
||
|
||
float[] actual = new float[64]; | ||
dest.ScaledCopyTo(actual); | ||
|
||
Assert.Equal(expected, actual); | ||
} | ||
|
||
#if SUPPORTS_RUNTIME_INTRINSICS | ||
[Fact] | ||
public void TransposeIntoAvx() | ||
{ | ||
float[] expected = Create8x8FloatData(); | ||
ReferenceImplementations.Transpose8x8(expected); | ||
|
||
var source = default(Block8x8F); | ||
source.LoadFrom(Create8x8FloatData()); | ||
|
||
var dest = default(Block8x8F); | ||
source.TransposeIntoAvx(ref dest); | ||
|
||
float[] actual = new float[64]; | ||
dest.ScaledCopyTo(actual); | ||
|
||
Assert.Equal(expected, actual); | ||
} | ||
#endif | ||
|
||
private class BufferHolder | ||
{ | ||
public Block8x8F Buffer; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This
doesn't changeonly slightly changes the assembly that is generated (vmovups
vsvmovupd
- see comment below) but it might show the intention better as:There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can anyone explain the difference in the precision of the output instruction here?
vmovups
tovmovupd
atL0006
Original
Suggested Change
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could be error but I spotted a slight slow down in the change. Keeping as-is for now.