Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vectorize Jpeg Encoder Color Conversion #1508

Merged
merged 5 commits into from
Jan 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using System;
using System.Runtime.CompilerServices;
using SixLabors.ImageSharp.PixelFormats;

namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
{
/// <summary>
/// Provides 8-bit lookup tables for converting from Rgb to YCbCr colorspace.
/// Methods to build the tables are based on libjpeg implementation.
/// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)!
/// </summary>
internal unsafe struct RgbToYCbCrTables
internal unsafe struct RgbToYCbCrConverterLut
{
/// <summary>
/// The red luminance table
Expand Down Expand Up @@ -63,10 +64,10 @@ internal unsafe struct RgbToYCbCrTables
/// <summary>
/// Initializes the YCbCr tables
/// </summary>
/// <returns>The initialized <see cref="RgbToYCbCrTables"/></returns>
public static RgbToYCbCrTables Create()
/// <returns>The initialized <see cref="RgbToYCbCrConverterLut"/></returns>
public static RgbToYCbCrConverterLut Create()
JimBobSquarePants marked this conversation as resolved.
Show resolved Hide resolved
{
RgbToYCbCrTables tables = default;
RgbToYCbCrConverterLut tables = default;

for (int i = 0; i <= 255; i++)
{
Expand All @@ -92,11 +93,10 @@ public static RgbToYCbCrTables Create()
}

/// <summary>
/// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)!
/// Optimized method to allocates the correct y, cb, and cr values to the DCT blocks from the given r, g, b values.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void ConvertPixelInto(
private void ConvertPixelInto(
int r,
int g,
int b,
Expand All @@ -111,10 +111,29 @@ public void ConvertPixelInto(
// float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;

// float cr = MathF.Round(y + (1.772F * cb), MidpointRounding.AwayFromZero);
// float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
}

public void Convert(Span<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
{
ref Rgb24 rgbStart = ref rgbSpan[0];

for (int i = 0; i < 64; i++)
{
ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);

this.ConvertPixelInto(
c.R,
c.G,
c.B,
ref yBlock,
ref cbBlock,
ref crBlock,
i);
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Fix(float x)
=> (int)((x * (1L << ScaleBits)) + 0.5F);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using System;
using System.Diagnostics;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
using SixLabors.ImageSharp.PixelFormats;

namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
{
internal static class RgbToYCbCrConverterVectorized
{
public static bool IsSupported
{
get
{
#if SUPPORTS_RUNTIME_INTRINSICS
return Avx2.IsSupported;
#else
return false;
#endif
}
}

#if SUPPORTS_RUNTIME_INTRINSICS
private static ReadOnlySpan<byte> MoveFirst24BytesToSeparateLanes => new byte[]
{
0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0,
3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0
};

private static ReadOnlySpan<byte> MoveLast24BytesToSeparateLanes => new byte[]
{
2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0,
5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0
};

private static ReadOnlySpan<byte> ExtractRgb => new byte[]
{
0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF,
0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF
};
#endif

public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
{
Debug.Assert(IsSupported, "AVX2 is required to run this converter");

#if SUPPORTS_RUNTIME_INTRINSICS
var f0299 = Vector256.Create(0.299f);
var f0587 = Vector256.Create(0.587f);
var f0114 = Vector256.Create(0.114f);
var fn0168736 = Vector256.Create(-0.168736f);
var fn0331264 = Vector256.Create(-0.331264f);
var f128 = Vector256.Create(128f);
var fn0418688 = Vector256.Create(-0.418688f);
var fn0081312F = Vector256.Create(-0.081312F);
var f05 = Vector256.Create(0.5f);
var zero = Vector256.Create(0).AsByte();

ref Vector256<byte> inRef = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
ref Vector256<float> destYRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref yBlock);
ref Vector256<float> destCbRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock);
ref Vector256<float> destCrRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock);

var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
Vector256<byte> rgb, rg, bx;
Vector256<float> r, g, b;
for (int i = 0; i < 7; i++)
{
rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte();

rgb = Avx2.Shuffle(rgb, extractRgbMask);

rg = Avx2.UnpackLow(rgb, zero);
bx = Avx2.UnpackHigh(rgb, zero);

r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());

// (0.299F * r) + (0.587F * g) + (0.114F * b);
Unsafe.Add(ref destYRef, i) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);

// 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));

// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
}

extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes));
rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte();
rgb = Avx2.Shuffle(rgb, extractRgbMask);

rg = Avx2.UnpackLow(rgb, zero);
bx = Avx2.UnpackHigh(rgb, zero);

r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());

// (0.299F * r) + (0.587F * g) + (0.114F * b);
Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);

// 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));

// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
#endif
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
// Licensed under the Apache License, Version 2.0.

using System;
using System.Runtime.CompilerServices;
using SixLabors.ImageSharp.Advanced;
using SixLabors.ImageSharp.PixelFormats;

Expand Down Expand Up @@ -33,7 +32,7 @@ internal ref struct YCbCrForwardConverter<TPixel>
/// <summary>
/// The color conversion tables
/// </summary>
private RgbToYCbCrTables colorTables;
private RgbToYCbCrConverterLut colorTables;

/// <summary>
/// Temporal 8x8 block to hold TPixel data
Expand All @@ -48,7 +47,12 @@ internal ref struct YCbCrForwardConverter<TPixel>
public static YCbCrForwardConverter<TPixel> Create()
{
var result = default(YCbCrForwardConverter<TPixel>);
result.colorTables = RgbToYCbCrTables.Create();
if (!RgbToYCbCrConverterVectorized.IsSupported)
{
// Avoid creating lookup tables, when vectorized converter is supported
result.colorTables = RgbToYCbCrConverterLut.Create();
}

return result;
}

Expand All @@ -65,20 +69,14 @@ public void Convert(ImageFrame<TPixel> frame, int x, int y, in RowOctet<TPixel>
ref Block8x8F yBlock = ref this.Y;
ref Block8x8F cbBlock = ref this.Cb;
ref Block8x8F crBlock = ref this.Cr;
ref Rgb24 rgbStart = ref rgbSpan[0];

for (int i = 0; i < 64; i++)
if (RgbToYCbCrConverterVectorized.IsSupported)
{
ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);

this.colorTables.ConvertPixelInto(
c.R,
c.G,
c.B,
ref yBlock,
ref cbBlock,
ref crBlock,
i);
RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
}
else
{
this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using System;
using BenchmarkDotNet.Attributes;
using SixLabors.ImageSharp.Formats.Jpeg.Components;
using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder;
using SixLabors.ImageSharp.PixelFormats;

namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components.Encoder
{
public class YCbCrForwardConverterBenchmark
{
private RgbToYCbCrConverterLut converter;
private Rgb24[] data;

[GlobalSetup]
public void Setup()
{
this.converter = RgbToYCbCrConverterLut.Create();

var r = new Random(42);
this.data = new Rgb24[64];

var d = new byte[3];
for (int i = 0; i < this.data.Length; i++)
{
r.NextBytes(d);
this.data[i] = new Rgb24(d[0], d[1], d[2]);
}
}

[Benchmark(Baseline = true)]
public void ConvertLut()
{
Block8x8F y = default;
Block8x8F cb = default;
Block8x8F cr = default;

this.converter.Convert(this.data.AsSpan(), ref y, ref cb, ref cr);
}

[Benchmark]
public void ConvertVectorized()
{
Block8x8F y = default;
Block8x8F cb = default;
Block8x8F cr = default;

if (RgbToYCbCrConverterVectorized.IsSupported)
{
RgbToYCbCrConverterVectorized.Convert(this.data.AsSpan(), ref y, ref cb, ref cr);
}
}
}
}
Loading