diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.cs b/src/ImageSharp/Common/Extensions/SimdUtils.cs
deleted file mode 100644
index 7b77fefcac..0000000000
--- a/src/ImageSharp/Common/Extensions/SimdUtils.cs
+++ /dev/null
@@ -1,232 +0,0 @@
-﻿// Copyright (c) Six Labors and contributors.
-// Licensed under the Apache License, Version 2.0.
-
-using System;
-using System.Diagnostics;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-
-namespace SixLabors.ImageSharp
-{
-    /// <summary>
-    /// Various extension and utility methods for <see cref="Vector4"/> and <see cref="Vector{T}"/> utilizing SIMD capabilities
-    /// </summary>
-    internal static class SimdUtils
-    {
-        /// <summary>
-        /// Gets a value indicating whether the code is being executed on AVX2 CPU where both float and integer registers are of size 256 byte.
-        /// </summary>
-        public static bool IsAvx2CompatibleArchitecture => Vector<float>.Count == 8 && Vector<int>.Count == 8;
-
-        internal static void GuardAvx2(string operation)
-        {
-            if (!IsAvx2CompatibleArchitecture)
-            {
-                throw new NotSupportedException($"{operation} is supported only on AVX2 CPU!");
-            }
-        }
-
-        /// <summary>
-        /// Transform all scalars in 'v' in a way that converting them to <see cref="int"/> would have rounding semantics.
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static Vector4 PseudoRound(this Vector4 v)
-        {
-            var sign = Vector4.Clamp(v, new Vector4(-1), new Vector4(1));
-
-            return v + (sign * 0.5f);
-        }
-
-        /// <summary>
-        /// Rounds all values in 'v' to the nearest integer following <see cref="MidpointRounding.ToEven"/> semantics.
-        /// Source:
-        /// <see>
-        ///     <cref>https://github.com/g-truc/glm/blob/master/glm/simd/common.h#L110</cref>
-        /// </see>
-        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        internal static Vector<float> FastRound(this Vector<float> x)
-        {
-            Vector<int> magic0 = new Vector<int>(int.MinValue); // 0x80000000
-            Vector<float> sgn0 = Vector.AsVectorSingle(magic0);
-            Vector<float> and0 = Vector.BitwiseAnd(sgn0, x);
-            Vector<float> or0 = Vector.BitwiseOr(and0, new Vector<float>(8388608.0f));
-            Vector<float> add0 = Vector.Add(x, or0);
-            Vector<float> sub0 = Vector.Subtract(add0, or0);
-            return sub0;
-        }
-
-        /// <summary>
-        /// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/> values.
-        /// The values gonna be scaled up into [0-255] and rounded.
-        /// Based on:
-        /// <see>
-        ///     <cref>http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions</cref>
-        /// </see>
-        /// </summary>
-        internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
-        {
-            GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
-
-            DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");
-
-            if (source.Length == 0)
-            {
-                return;
-            }
-
-            ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
-            ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
-            int n = source.Length / 8;
-
-            Vector<float> magick = new Vector<float>(32768.0f);
-            Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
-
-            // need to copy to a temporary struct, because
-            // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
-            // does not work. TODO: This might be a CoreClr bug, need to ask/report
-            var temp = default(Octet.OfUInt32);
-            ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
-
-            for (int i = 0; i < n; i++)
-            {
-                // union { float f; uint32_t i; } u;
-                // u.f = 32768.0f + x * (255.0f / 256.0f);
-                // return (uint8_t)u.i;
-                Vector<float> x = Unsafe.Add(ref srcBase, i);
-                x = (x * scale) + magick;
-                tempRef = x;
-
-                ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
-                d.LoadFrom(ref temp);
-            }
-        }
-
-        /// <summary>
-        /// Same as <see cref="BulkConvertNormalizedFloatToByte"/> but clamps overflown values before conversion.
-        /// </summary>
-        internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
-        {
-            GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
-
-            DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");
-
-            if (source.Length == 0)
-            {
-                return;
-            }
-
-            ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
-            ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
-            int n = source.Length / 8;
-
-            Vector<float> magick = new Vector<float>(32768.0f);
-            Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
-
-            // need to copy to a temporary struct, because
-            // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
-            // does not work. TODO: This might be a CoreClr bug, need to ask/report
-            var temp = default(Octet.OfUInt32);
-            ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
-
-            for (int i = 0; i < n; i++)
-            {
-                // union { float f; uint32_t i; } u;
-                // u.f = 32768.0f + x * (255.0f / 256.0f);
-                // return (uint8_t)u.i;
-                Vector<float> x = Unsafe.Add(ref srcBase, i);
-                x = Vector.Max(x, Vector<float>.Zero);
-                x = Vector.Min(x, Vector<float>.One);
-
-                x = (x * scale) + magick;
-                tempRef = x;
-
-                ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
-                d.LoadFrom(ref temp);
-            }
-        }
-
-        // TODO: Replace these with T4-d library level tuples!
-        internal static class Octet
-        {
-            [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))]
-            public struct OfUInt32
-            {
-                [FieldOffset(0 * sizeof(uint))]
-                public uint V0;
-
-                [FieldOffset(1 * sizeof(uint))]
-                public uint V1;
-
-                [FieldOffset(2 * sizeof(uint))]
-                public uint V2;
-
-                [FieldOffset(3 * sizeof(uint))]
-                public uint V3;
-
-                [FieldOffset(4 * sizeof(uint))]
-                public uint V4;
-
-                [FieldOffset(5 * sizeof(uint))]
-                public uint V5;
-
-                [FieldOffset(6 * sizeof(uint))]
-                public uint V6;
-
-                [FieldOffset(7 * sizeof(uint))]
-                public uint V7;
-
-                public override string ToString()
-                {
-                    return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
-                }
-            }
-
-            [StructLayout(LayoutKind.Explicit, Size = 8)]
-            public struct OfByte
-            {
-                [FieldOffset(0)]
-                public byte V0;
-
-                [FieldOffset(1)]
-                public byte V1;
-
-                [FieldOffset(2)]
-                public byte V2;
-
-                [FieldOffset(3)]
-                public byte V3;
-
-                [FieldOffset(4)]
-                public byte V4;
-
-                [FieldOffset(5)]
-                public byte V5;
-
-                [FieldOffset(6)]
-                public byte V6;
-
-                [FieldOffset(7)]
-                public byte V7;
-
-                public override string ToString()
-                {
-                    return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
-                }
-
-                public void LoadFrom(ref OfUInt32 i)
-                {
-                    this.V0 = (byte)i.V0;
-                    this.V1 = (byte)i.V1;
-                    this.V2 = (byte)i.V2;
-                    this.V3 = (byte)i.V3;
-                    this.V4 = (byte)i.V4;
-                    this.V5 = (byte)i.V5;
-                    this.V6 = (byte)i.V6;
-                    this.V7 = (byte)i.V7;
-                }
-            }
-        }
-    }
-}
\ No newline at end of file
diff --git a/src/ImageSharp/Common/Helpers/ImageMaths.cs b/src/ImageSharp/Common/Helpers/ImageMaths.cs
index 35769d96a7..02a2e9ee55 100644
--- a/src/ImageSharp/Common/Helpers/ImageMaths.cs
+++ b/src/ImageSharp/Common/Helpers/ImageMaths.cs
@@ -39,6 +39,28 @@ public static int LeastCommonMultiple(int a, int b)
             return (a / GreatestCommonDivisor(a, b)) * b;
         }
 
+        /// <summary>
+        /// Calculates <paramref name="x"/> % 4 using a bit mask. Matches the % operator only for non-negative <paramref name="x"/>.
+        /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static int Modulo4(int x) => x & 3;
+
+        /// <summary>
+        /// Calculates <paramref name="x"/> % 8 using a bit mask. Matches the % operator only for non-negative <paramref name="x"/>.
+        /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static int Modulo8(int x) => x & 7;
+
+        /// <summary>
+        /// Fast (x mod m) calculator based on a bit mask, with the restriction that
+        /// <paramref name="m"/> must be a power of 2. Matches the % operator only for non-negative <paramref name="x"/>.
+        /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static int ModuloP2(int x, int m)
+        {
+            return x & (m - 1);
+        }
+
         /// <summary>
         /// Returns the absolute value of a 32-bit signed integer. Uses bit shifting to speed up the operation.
         /// </summary>
@@ -46,7 +68,7 @@ public static int LeastCommonMultiple(int a, int b)
         /// A number that is greater than <see cref="int.MinValue"/>, but less than or equal to <see cref="int.MaxValue"/>
         /// </param>
         /// <returns>The <see cref="int"/></returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static int FastAbs(int x)
         {
             int y = x >> 31;
@@ -58,7 +80,7 @@ public static int FastAbs(int x)
         /// </summary>
         /// <param name="x">A single-precision floating-point number</param>
         /// <returns>The number <paramref name="x" /> raised to the power of 2.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static float Pow2(float x) => x * x;
 
         /// <summary>
@@ -66,7 +88,7 @@ public static int FastAbs(int x)
         /// </summary>
         /// <param name="x">A single-precision floating-point number</param>
         /// <returns>The number <paramref name="x" /> raised to the power of 3.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static float Pow3(float x) => x * x * x;
 
         /// <summary>
@@ -77,7 +99,7 @@ public static int FastAbs(int x)
         /// <returns>
         /// The <see cref="int"/>
         /// </returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static int GetBitsNeededForColorDepth(int colors) => Math.Max(1, (int)Math.Ceiling(Math.Log(colors, 2)));
 
         /// <summary>
@@ -85,7 +107,7 @@ public static int FastAbs(int x)
         /// </summary>
         /// <param name="bitDepth">The bit depth.</param>
         /// <returns>The <see cref="int"/></returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static int GetColorCountForBitDepth(int bitDepth) => 1 << bitDepth;
 
         /// <summary>
@@ -94,7 +116,7 @@ public static int FastAbs(int x)
         /// <param name="x">The x provided to G(x).</param>
         /// <param name="sigma">The spread of the blur.</param>
         /// <returns>The Gaussian G(x)</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static float Gaussian(float x, float sigma)
         {
             const float Numerator = 1.0f;
@@ -117,7 +139,7 @@ public static float Gaussian(float x, float sigma)
         /// <returns>
         /// The sine cardinal of <paramref name="f" />.
         /// </returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static float SinC(float f)
         {
             if (MathF.Abs(f) > Constants.Epsilon)
@@ -140,7 +162,7 @@ public static float SinC(float f)
         /// <returns>
         /// The <see cref="float"/>.
         /// </returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static float GetBcValue(float x, float b, float c)
         {
             if (x < 0F)
@@ -176,7 +198,7 @@ public static float GetBcValue(float x, float b, float c)
         /// <returns>
         /// The bounding <see cref="Rectangle"/>.
         /// </returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
         public static Rectangle GetBoundingRectangle(Point topLeft, Point bottomRight) => new Rectangle(topLeft.X, topLeft.Y, bottomRight.X - topLeft.X, bottomRight.Y - topLeft.Y);
 
         /// <summary>
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
new file mode 100644
index 0000000000..0f1ce2ab6a
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
@@ -0,0 +1,215 @@
+﻿// Copyright (c) Six Labors and contributors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Tuples;
+
+// ReSharper disable MemberHidesStaticFromOuterClass
+namespace SixLabors.ImageSharp
+{
+    internal static partial class SimdUtils
+    {
+        /// <summary>
+        /// Implementation with 256bit / AVX2 intrinsics NOT depending on newer APIs (Vector.Widen etc.)
+        /// </summary>
+        public static class BasicIntrinsics256
+        {
+            public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture;
+
+            /// <summary>
+            /// Converts as many elements as possible with <see cref="BulkConvertByteToNormalizedFloat"/>, then slices both spans down so that only the unconverted remainder is left.
+            /// </summary>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            internal static void BulkConvertByteToNormalizedFloatReduce(
+                ref ReadOnlySpan<byte> source,
+                ref Span<float> dest)
+            {
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+                if (!IsAvailable)
+                {
+                    return;
+                }
+
+                int remainder = ImageMaths.Modulo8(source.Length);
+                int adjustedCount = source.Length - remainder;
+
+                if (adjustedCount > 0)
+                {
+                    BulkConvertByteToNormalizedFloat(
+                        source.Slice(0, adjustedCount),
+                        dest.Slice(0, adjustedCount));
+
+                    source = source.Slice(adjustedCount);
+                    dest = dest.Slice(adjustedCount);
+                }
+            }
+
+            /// <summary>
+            /// Converts as many elements as possible with <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/>, then slices both spans down so that only the unconverted remainder is left.
+            /// </summary>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
+                ref ReadOnlySpan<float> source,
+                ref Span<byte> dest)
+            {
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+                if (!IsAvailable)
+                {
+                    return;
+                }
+
+                int remainder = ImageMaths.Modulo8(source.Length);
+                int adjustedCount = source.Length - remainder;
+
+                if (adjustedCount > 0)
+                {
+                    BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
+
+                    source = source.Slice(adjustedCount);
+                    dest = dest.Slice(adjustedCount);
+                }
+            }
+
+            /// <summary>
+            /// SIMD optimized implementation for <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>.
+            /// Works only with span Length divisible by 8.
+            /// Implementation adapted from:
+            /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
+            /// http://stackoverflow.com/a/536278
+            /// </summary>
+            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
+            {
+                VerifyIsAvx2Compatible(nameof(BulkConvertByteToNormalizedFloat));
+                VerifySpanInput(source, dest, 8);
+
+                var bVec = new Vector<float>(256.0f / 255.0f);
+                var magicFloat = new Vector<float>(32768.0f);
+                var magicInt = new Vector<uint>(1191182336); // reinterpreted value of 32768.0f
+                var mask = new Vector<uint>(255);
+
+                ref Octet.OfByte sourceBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(source));
+                ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As<float, Octet.OfUInt32>(ref MemoryMarshal.GetReference(dest));
+
+                ref Vector<float> destBaseAsFloat = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref destBaseAsWideOctet);
+
+                int n = dest.Length / 8;
+
+                for (int i = 0; i < n; i++)
+                {
+                    ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i);
+                    ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i);
+                    d.LoadFrom(ref s);
+                }
+
+                for (int i = 0; i < n; i++)
+                {
+                    ref Vector<float> df = ref Unsafe.Add(ref destBaseAsFloat, i);
+
+                    var vi = Vector.AsVectorUInt32(df);
+                    vi &= mask;
+                    vi |= magicInt;
+
+                    var vf = Vector.AsVectorSingle(vi);
+                    vf = (vf - magicFloat) * bVec;
+
+                    df = vf;
+                }
+            }
+
+            /// <summary>
+            /// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> which is faster on older runtimes.
+            /// </summary>
+            internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
+            {
+                VerifyIsAvx2Compatible(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
+                VerifySpanInput(source, dest, 8);
+
+                if (source.Length == 0)
+                {
+                    return;
+                }
+
+                ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
+                ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
+                int n = source.Length / 8;
+
+                Vector<float> magick = new Vector<float>(32768.0f);
+                Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
+
+                // need to copy to a temporary struct, because
+                // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
+                // does not work. TODO: This might be a CoreClr bug, need to ask/report
+                var temp = default(Octet.OfUInt32);
+                ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
+
+                for (int i = 0; i < n; i++)
+                {
+                    // union { float f; uint32_t i; } u;
+                    // u.f = 32768.0f + x * (255.0f / 256.0f);
+                    // return (uint8_t)u.i;
+                    Vector<float> x = Unsafe.Add(ref srcBase, i);
+                    x = Vector.Max(x, Vector<float>.Zero);
+                    x = Vector.Min(x, Vector<float>.One);
+
+                    x = (x * scale) + magick;
+                    tempRef = x;
+
+                    ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
+                    d.LoadFrom(ref temp);
+                }
+            }
+
+            /// <summary>
+            /// Convert all <see cref="float"/> values normalized into [0..1] from 'source'
+            /// into 'dest' buffer of <see cref="byte"/>. The values are scaled up into [0-255] and rounded.
+            /// This implementation is SIMD optimized and works only when span Length is divisible by 8.
+            /// Based on:
+            /// <see>
+            ///     <cref>http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions</cref>
+            /// </see>
+            /// </summary>
+            internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
+            {
+                VerifyIsAvx2Compatible(nameof(BulkConvertNormalizedFloatToByte));
+                VerifySpanInput(source, dest, 8);
+
+                if (source.Length == 0)
+                {
+                    return;
+                }
+
+                ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
+                ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
+                int n = source.Length / 8;
+
+                Vector<float> magick = new Vector<float>(32768.0f);
+                Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
+
+                // need to copy to a temporary struct, because
+                // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
+                // does not work. TODO: This might be a CoreClr bug, need to ask/report
+                var temp = default(Octet.OfUInt32);
+                ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
+
+                for (int i = 0; i < n; i++)
+                {
+                    // union { float f; uint32_t i; } u;
+                    // u.f = 32768.0f + x * (255.0f / 256.0f);
+                    // return (uint8_t)u.i;
+                    Vector<float> x = Unsafe.Add(ref srcBase, i);
+                    x = (x * scale) + magick;
+                    tempRef = x;
+
+                    ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
+                    d.LoadFrom(ref temp);
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
new file mode 100644
index 0000000000..e0d6187dca
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
@@ -0,0 +1,178 @@
+﻿using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// ReSharper disable MemberHidesStaticFromOuterClass
+namespace SixLabors.ImageSharp
+{
+    internal static partial class SimdUtils
+    {
+        /// <summary>
+        /// Implementation methods based on newer <see cref="Vector{T}"/> APIs (Vector.Widen, Vector.Narrow, Vector.ConvertTo*).
+        /// Accelerated only on RyuJIT versions having dotnet/coreclr#10662 merged (.NET Core 2.1+, .NET 4.7.2+)
+        /// See:
+        /// https://github.com/dotnet/coreclr/pull/10662
+        /// API Proposal:
+        /// https://github.com/dotnet/corefx/issues/15957
+        /// </summary>
+        public static class ExtendedIntrinsics
+        {
+            public static bool IsAvailable { get; } =
+#if NETCOREAPP2_1
+                // TODO: Also available in .NET 4.7.2, we need to add a build target!
+                Vector.IsHardwareAccelerated;
+#else
+                false;
+#endif
+
+            /// <summary>
+            /// Converts as many elements as possible with <see cref="BulkConvertByteToNormalizedFloat"/>, then slices both spans down so that only the unconverted remainder is left.
+            /// </summary>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            internal static void BulkConvertByteToNormalizedFloatReduce(
+                ref ReadOnlySpan<byte> source,
+                ref Span<float> dest)
+            {
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+                if (!IsAvailable)
+                {
+                    return;
+                }
+
+                int remainder = ImageMaths.ModuloP2(source.Length, Vector<byte>.Count);
+                int adjustedCount = source.Length - remainder;
+
+                if (adjustedCount > 0)
+                {
+                    BulkConvertByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
+
+                    source = source.Slice(adjustedCount);
+                    dest = dest.Slice(adjustedCount);
+                }
+            }
+
+            /// <summary>
+            /// Converts as many elements as possible with <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/>, then slices both spans down so that only the unconverted remainder is left.
+            /// </summary>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
+                ref ReadOnlySpan<float> source,
+                ref Span<byte> dest)
+            {
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+                if (!IsAvailable)
+                {
+                    return;
+                }
+
+                int remainder = ImageMaths.ModuloP2(source.Length, Vector<byte>.Count);
+                int adjustedCount = source.Length - remainder;
+
+                if (adjustedCount > 0)
+                {
+                    BulkConvertNormalizedFloatToByteClampOverflows(
+                        source.Slice(0, adjustedCount),
+                        dest.Slice(0, adjustedCount));
+
+                    source = source.Slice(adjustedCount);
+                    dest = dest.Slice(adjustedCount);
+                }
+            }
+
+            /// <summary>
+            /// Implementation of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>, which is faster on the new RyuJIT runtime.
+            /// </summary>
+            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
+            {
+                VerifySpanInput(source, dest, Vector<byte>.Count);
+
+                int n = dest.Length / Vector<byte>.Count;
+
+                ref Vector<byte> sourceBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(source));
+                ref Vector<float> destBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dest));
+
+                for (int i = 0; i < n; i++)
+                {
+                    Vector<byte> b = Unsafe.Add(ref sourceBase, i);
+
+                    Vector.Widen(b, out Vector<ushort> s0, out Vector<ushort> s1);
+                    Vector.Widen(s0, out Vector<uint> w0, out Vector<uint> w1);
+                    Vector.Widen(s1, out Vector<uint> w2, out Vector<uint> w3);
+
+                    Vector<float> f0 = ConvertToSingle(w0);
+                    Vector<float> f1 = ConvertToSingle(w1);
+                    Vector<float> f2 = ConvertToSingle(w2);
+                    Vector<float> f3 = ConvertToSingle(w3);
+
+                    ref Vector<float> d = ref Unsafe.Add(ref destBase, i * 4);
+                    d = f0;
+                    Unsafe.Add(ref d, 1) = f1;
+                    Unsafe.Add(ref d, 2) = f2;
+                    Unsafe.Add(ref d, 3) = f3;
+                }
+            }
+
+            /// <summary>
+            /// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
+            /// </summary>
+            internal static void BulkConvertNormalizedFloatToByteClampOverflows(
+                ReadOnlySpan<float> source,
+                Span<byte> dest)
+            {
+                VerifySpanInput(source, dest, Vector<byte>.Count);
+
+                int n = dest.Length / Vector<byte>.Count;
+
+                ref Vector<float> sourceBase =
+                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
+                ref Vector<byte> destBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(dest));
+
+                for (int i = 0; i < n; i++)
+                {
+                    ref Vector<float> s = ref Unsafe.Add(ref sourceBase, i * 4);
+
+                    Vector<float> f0 = s;
+                    Vector<float> f1 = Unsafe.Add(ref s, 1);
+                    Vector<float> f2 = Unsafe.Add(ref s, 2);
+                    Vector<float> f3 = Unsafe.Add(ref s, 3);
+
+                    Vector<uint> w0 = ConvertToUInt32(f0);
+                    Vector<uint> w1 = ConvertToUInt32(f1);
+                    Vector<uint> w2 = ConvertToUInt32(f2);
+                    Vector<uint> w3 = ConvertToUInt32(f3);
+
+                    Vector<ushort> u0 = Vector.Narrow(w0, w1);
+                    Vector<ushort> u1 = Vector.Narrow(w2, w3);
+
+                    Vector<byte> b = Vector.Narrow(u0, u1);
+
+                    Unsafe.Add(ref destBase, i) = b;
+                }
+            }
+
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private static Vector<uint> ConvertToUInt32(Vector<float> vf)
+            {
+                Vector<float> maxBytes = new Vector<float>(255f);
+                vf *= maxBytes;
+                vf += new Vector<float>(0.5f);
+                vf = Vector.Min(Vector.Max(vf, Vector<float>.Zero), maxBytes);
+                Vector<int> vi = Vector.ConvertToInt32(vf);
+                return Vector.AsVectorUInt32(vi);
+            }
+
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private static Vector<float> ConvertToSingle(Vector<uint> u)
+            {
+                Vector<int> vi = Vector.AsVectorInt32(u);
+                Vector<float> v = Vector.ConvertToSingle(vi);
+                v *= new Vector<float>(1f / 255f);
+                return v;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
new file mode 100644
index 0000000000..565ea08f5d
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
@@ -0,0 +1,151 @@
+﻿// Copyright (c) Six Labors and contributors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// ReSharper disable MemberHidesStaticFromOuterClass
+namespace SixLabors.ImageSharp
+{
+    internal static partial class SimdUtils
+    {
+        /// <summary>
+        /// Fallback implementation based on <see cref="Vector4"/> (128 bit), processing 4 elements per iteration.
+        /// For <see cref="Vector4"/>, efficient software fallback implementations are present,
+        /// and we hope that even Mono's JIT is able to emit SIMD instructions for that type.
+        /// </summary>
+        public static class FallbackIntrinsics128
+        {
+            /// <summary>
+            /// Runs <see cref="BulkConvertByteToNormalizedFloat"/> on the leading elements (length minus its <c>ImageMaths.Modulo4</c> remainder), then re-slices both spans so only the unprocessed remainder is left.
+            /// </summary>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            internal static void BulkConvertByteToNormalizedFloatReduce(
+                ref ReadOnlySpan<byte> source,
+                ref Span<float> dest)
+            {
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+                int remainder = ImageMaths.Modulo4(source.Length);
+                int adjustedCount = source.Length - remainder;
+
+                if (adjustedCount > 0)
+                {
+                    BulkConvertByteToNormalizedFloat(
+                        source.Slice(0, adjustedCount),
+                        dest.Slice(0, adjustedCount));
+
+                    source = source.Slice(adjustedCount);
+                    dest = dest.Slice(adjustedCount);
+                }
+            }
+
+            /// <summary>
+            /// Runs <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> on the leading elements (length minus its <c>ImageMaths.Modulo4</c> remainder), then re-slices both spans so only the unprocessed remainder is left.
+            /// </summary>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
+                ref ReadOnlySpan<float> source,
+                ref Span<byte> dest)
+            {
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+                int remainder = ImageMaths.Modulo4(source.Length);
+                int adjustedCount = source.Length - remainder;
+
+                if (adjustedCount > 0)
+                {
+                    BulkConvertNormalizedFloatToByteClampOverflows(
+                        source.Slice(0, adjustedCount),
+                        dest.Slice(0, adjustedCount));
+
+                    source = source.Slice(adjustedCount);
+                    dest = dest.Slice(adjustedCount);
+                }
+            }
+
+            /// <summary>
+            /// Implementation of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/> using <see cref="Vector4"/>: every group of 4 bytes is widened to floats and multiplied by 1/255.
+            /// </summary>
+            [MethodImpl(InliningOptions.ColdPath)]
+            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
+            {
+                VerifySpanInput(source, dest, 4);
+
+                int count = dest.Length / 4;
+                if (count == 0)
+                {
+                    return;
+                }
+
+                ref ByteVector4 sBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(source));
+                ref Vector4 dBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(dest));
+
+                const float Scale = 1f / 255f;
+                Vector4 d = default;
+
+                for (int i = 0; i < count; i++)
+                {
+                    ref ByteVector4 s = ref Unsafe.Add(ref sBase, i);
+                    d.X = s.X;
+                    d.Y = s.Y;
+                    d.Z = s.Z;
+                    d.W = s.W;
+                    d *= Scale;
+                    Unsafe.Add(ref dBase, i) = d;
+                }
+            }
+
+            /// <summary>
+            /// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> using <see cref="Vector4"/>: every group of 4 floats is scaled by 255, offset by 0.5, clamped to [0..255] and cast to bytes.
+            /// </summary>
+            [MethodImpl(InliningOptions.ColdPath)]
+            internal static void BulkConvertNormalizedFloatToByteClampOverflows(
+                ReadOnlySpan<float> source,
+                Span<byte> dest)
+            {
+                VerifySpanInput(source, dest, 4);
+
+                int count = source.Length / 4;
+                if (count == 0)
+                {
+                    return;
+                }
+
+                ref Vector4 sBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(source));
+                ref ByteVector4 dBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(dest));
+
+                var half = new Vector4(0.5f);
+                var maxBytes = new Vector4(255f);
+
+                for (int i = 0; i < count; i++)
+                {
+                    Vector4 s = Unsafe.Add(ref sBase, i);
+                    s *= maxBytes;
+                    s += half;
+
+                    // Max + Min are used instead of Vector4.Clamp(): not sure if Clamp() is properly implemented with intrinsics.
+                    s = Vector4.Max(Vector4.Zero, s);
+                    s = Vector4.Min(maxBytes, s);
+
+                    ref ByteVector4 d = ref Unsafe.Add(ref dBase, i);
+                    d.X = (byte)s.X;
+                    d.Y = (byte)s.Y;
+                    d.Z = (byte)s.Z;
+                    d.W = (byte)s.W;
+                }
+            }
+
+            [StructLayout(LayoutKind.Sequential)]
+            private struct ByteVector4
+            {
+                public byte X;
+                public byte Y;
+                public byte Z;
+                public byte W;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs
new file mode 100644
index 0000000000..737e620061
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs
@@ -0,0 +1,185 @@
+﻿// Copyright (c) Six Labors and contributors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+using SixLabors.ImageSharp.PixelFormats;
+using SixLabors.ImageSharp.Tuples;
+
+namespace SixLabors.ImageSharp
+{
+    /// <summary>
+    /// Various extension and utility methods for <see cref="Vector4"/> and <see cref="Vector{T}"/> utilizing SIMD capabilities.
+    /// </summary>
+    internal static partial class SimdUtils
+    {
+        /// <summary>
+        /// Gets a value indicating whether <see cref="Vector{T}"/> is hardware accelerated with 8 float and 8 int lanes, i.e. the code is being executed on AVX2 CPU where both float and integer registers are of size 256 bit.
+        /// </summary>
+        public static bool IsAvx2CompatibleArchitecture { get; } =
+            Vector.IsHardwareAccelerated && Vector<float>.Count == 8 && Vector<int>.Count == 8;
+
+        /// <summary>
+        /// Offsets each scalar in 'v' by half of its value clamped to [-1..1], so that converting them to <see cref="int"/> would have rounding semantics.
+        /// </summary>
+        /// <param name="v">The vector</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector4 PseudoRound(this Vector4 v)
+        {
+            var sign = Vector4.Clamp(v, new Vector4(-1), new Vector4(1));
+
+            return v + (sign * 0.5f);
+        }
+
+        /// <summary>
+        /// Rounds all values in 'v' to the nearest integer following <see cref="MidpointRounding.ToEven"/> semantics.
+        /// Source:
+        /// <see>
+        ///     <cref>https://github.com/g-truc/glm/blob/master/glm/simd/common.h#L110</cref>
+        /// </see>
+        /// </summary>
+        /// <param name="v">The vector</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector<float> FastRound(this Vector<float> v)
+        {
+            Vector<int> magic0 = new Vector<int>(int.MinValue); // 0x80000000
+            Vector<float> sgn0 = Vector.AsVectorSingle(magic0);
+            Vector<float> and0 = Vector.BitwiseAnd(sgn0, v);
+            Vector<float> or0 = Vector.BitwiseOr(and0, new Vector<float>(8388608.0f));
+            Vector<float> add0 = Vector.Add(v, or0);
+            Vector<float> sub0 = Vector.Subtract(add0, or0);
+            return sub0;
+        }
+
+        /// <summary>
+        /// Converts all input <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
+        /// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
+        /// but there are no restrictions on the span's length.
+        /// </summary>
+        /// <param name="source">The source span of bytes</param>
+        /// <param name="dest">The destination span of floats</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
+        {
+            DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+#if NETCOREAPP2_1
+            ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
+#else
+            BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
+#endif
+            FallbackIntrinsics128.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
+
+            // Deal with the remainder:
+            if (source.Length > 0)
+            {
+                ConverByteToNormalizedFloatRemainder(source, dest);
+            }
+        }
+
+        /// <summary>
+        /// Convert all <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/>.
+        /// The values are scaled up into [0-255] and rounded, overflows are clamped.
+        /// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
+        /// but there are no restrictions on the span's length.
+        /// </summary>
+        /// <param name="source">The source span of floats</param>
+        /// <param name="dest">The destination span of bytes</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
+        {
+            DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+
+#if NETCOREAPP2_1
+            ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
+#else
+            BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
+#endif
+            FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
+
+            // Deal with the remainder:
+            if (source.Length > 0)
+            {
+                ConvertNormalizedFloatToByteRemainder(source, dest);
+            }
+        }
+
+        [MethodImpl(InliningOptions.ColdPath)]
+        private static void ConverByteToNormalizedFloatRemainder(ReadOnlySpan<byte> source, Span<float> dest)
+        {
+            ref byte sBase = ref MemoryMarshal.GetReference(source);
+            ref float dBase = ref MemoryMarshal.GetReference(dest);
+
+            // At most 3 elements should be left at this point, so having a for loop is overkill.
+            // Let's minimize the no. of instructions by jumping from the highest remaining index down to 0!
+            switch (source.Length)
+            {
+                case 3:
+                    Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2) / 255f;
+                    goto case 2;
+                case 2:
+                    Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1) / 255f;
+                    goto case 1;
+                case 1:
+                    dBase = sBase / 255f;
+                    break;
+            }
+        }
+
+        [MethodImpl(InliningOptions.ColdPath)]
+        private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan<float> source, Span<byte> dest)
+        {
+            ref float sBase = ref MemoryMarshal.GetReference(source);
+            ref byte dBase = ref MemoryMarshal.GetReference(dest);
+
+            switch (source.Length)
+            {
+                case 3:
+                    Unsafe.Add(ref dBase, 2) = ConvertToByte(Unsafe.Add(ref sBase, 2));
+                    goto case 2;
+                case 2:
+                    Unsafe.Add(ref dBase, 1) = ConvertToByte(Unsafe.Add(ref sBase, 1));
+                    goto case 1;
+                case 1:
+                    dBase = ConvertToByte(sBase);
+                    break;
+            }
+        }
+
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static byte ConvertToByte(float f) => (byte)ComparableExtensions.Clamp((f * 255f) + 0.5f, 0, 255f);
+
+        [Conditional("DEBUG")]
+        private static void VerifyIsAvx2Compatible(string operation)
+        {
+            if (!IsAvx2CompatibleArchitecture)
+            {
+                throw new NotSupportedException($"{operation} is supported only on AVX2 CPU!");
+            }
+        }
+
+        [Conditional("DEBUG")]
+        private static void VerifySpanInput(ReadOnlySpan<byte> source, Span<float> dest, int shouldBeDivisibleBy)
+        {
+            DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+            DebugGuard.IsTrue(
+                ImageMaths.ModuloP2(dest.Length, shouldBeDivisibleBy) == 0,
+                nameof(source),
+                $"length should be divisable by {shouldBeDivisibleBy}!");
+        }
+
+        [Conditional("DEBUG")]
+        private static void VerifySpanInput(ReadOnlySpan<float> source, Span<byte> dest, int shouldBeDivisibleBy)
+        {
+            DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
+            DebugGuard.IsTrue(
+                ImageMaths.ModuloP2(dest.Length, shouldBeDivisibleBy) == 0,
+                nameof(source),
+                $"length should be divisable by {shouldBeDivisibleBy}!");
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/ImageSharp/Common/Tuples/Octet.cs b/src/ImageSharp/Common/Tuples/Octet.cs
new file mode 100644
index 0000000000..539b74e324
--- /dev/null
+++ b/src/ImageSharp/Common/Tuples/Octet.cs
@@ -0,0 +1,109 @@
+﻿using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp.Tuples
+{
+    /// <summary>
+    /// Contains 8-element value tuples of various types, each with explicitly laid out fields V0..V7.
+    /// </summary>
+    internal static class Octet
+    {
+        /// <summary>
+        /// Value tuple of 8 <see cref="uint"/>-s, placed at consecutive <c>sizeof(uint)</c> offsets.
+        /// </summary>
+        [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))]
+        public struct OfUInt32
+        {
+            [FieldOffset(0 * sizeof(uint))]
+            public uint V0;
+
+            [FieldOffset(1 * sizeof(uint))]
+            public uint V1;
+
+            [FieldOffset(2 * sizeof(uint))]
+            public uint V2;
+
+            [FieldOffset(3 * sizeof(uint))]
+            public uint V3;
+
+            [FieldOffset(4 * sizeof(uint))]
+            public uint V4;
+
+            [FieldOffset(5 * sizeof(uint))]
+            public uint V5;
+
+            [FieldOffset(6 * sizeof(uint))]
+            public uint V6;
+
+            [FieldOffset(7 * sizeof(uint))]
+            public uint V7;
+
+            public override string ToString()
+            {
+                return $"{nameof(Octet)}.{nameof(OfUInt32)}({this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7})";
+            }
+
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public void LoadFrom(ref OfByte src)
+            {
+                this.V0 = src.V0;
+                this.V1 = src.V1;
+                this.V2 = src.V2;
+                this.V3 = src.V3;
+                this.V4 = src.V4;
+                this.V5 = src.V5;
+                this.V6 = src.V6;
+                this.V7 = src.V7;
+            }
+        }
+
+        /// <summary>
+        /// Value tuple of 8 <see cref="byte"/>-s, placed at consecutive byte offsets.
+        /// </summary>
+        [StructLayout(LayoutKind.Explicit, Size = 8)]
+        public struct OfByte
+        {
+            [FieldOffset(0)]
+            public byte V0;
+
+            [FieldOffset(1)]
+            public byte V1;
+
+            [FieldOffset(2)]
+            public byte V2;
+
+            [FieldOffset(3)]
+            public byte V3;
+
+            [FieldOffset(4)]
+            public byte V4;
+
+            [FieldOffset(5)]
+            public byte V5;
+
+            [FieldOffset(6)]
+            public byte V6;
+
+            [FieldOffset(7)]
+            public byte V7;
+
+            public override string ToString()
+            {
+                return $"{nameof(Octet)}.{nameof(OfByte)}({this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7})";
+            }
+
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public void LoadFrom(ref OfUInt32 src)
+            {
+                this.V0 = (byte)src.V0;
+                this.V1 = (byte)src.V1;
+                this.V2 = (byte)src.V2;
+                this.V3 = (byte)src.V3;
+                this.V4 = (byte)src.V4;
+                this.V5 = (byte)src.V5;
+                this.V6 = (byte)src.V6;
+                this.V7 = (byte)src.V7;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/ImageSharp/Common/Tuples/Vector4Pair.cs b/src/ImageSharp/Common/Tuples/Vector4Pair.cs
index 309d5e2e56..cae283d628 100644
--- a/src/ImageSharp/Common/Tuples/Vector4Pair.cs
+++ b/src/ImageSharp/Common/Tuples/Vector4Pair.cs
@@ -2,11 +2,12 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 
-namespace SixLabors.ImageSharp.Common.Tuples
+namespace SixLabors.ImageSharp.Tuples
 {
     /// <summary>
     /// Its faster to process multiple Vector4-s together, so let's pair them!
     /// On AVX2 this pair should be convertible to <see cref="Vector{T}"/> of <see cref="float"/>!
+    /// TODO: Investigate defining this as union with an Octet.OfSingle type.
     /// </summary>
     [StructLayout(LayoutKind.Sequential)]
     internal struct Vector4Pair
@@ -15,8 +16,6 @@ internal struct Vector4Pair
 
         public Vector4 B;
 
-        private static readonly Vector4 Scale = new Vector4(1 / 255f);
-
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public void MultiplyInplace(float value)
         {
@@ -52,8 +51,9 @@ internal void RoundAndDownscalePreAvx2()
             b = b.FastRound();
 
             // Downscale by 1/255
-            this.A *= Scale;
-            this.B *= Scale;
+            var scale = new Vector4(1 / 255f);
+            this.A *= scale;
+            this.B *= scale;
         }
 
         /// <summary>
@@ -74,7 +74,7 @@ internal void RoundAndDownscaleAvx2()
 
         public override string ToString()
         {
-            return $"{this.A}, {this.B}";
+            return $"{nameof(Vector4Pair)}({this.A}, {this.B})";
         }
     }
 }
\ No newline at end of file
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs
index 4b2626c582..1dc72aaf5b 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs
@@ -6,7 +6,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 
-using SixLabors.ImageSharp.Common.Tuples;
+using SixLabors.ImageSharp.Tuples;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
 {
@@ -109,7 +109,7 @@ internal static void ConvertCore(in ComponentValues values, Span<Vector4> result
 
                     // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order:
                     ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
-                    destination.Collect(ref r, ref g, ref b);
+                    destination.Pack(ref r, ref g, ref b);
                 }
             }
         }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
index ab4947e65c..46644258b1 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
@@ -6,7 +6,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 
-using SixLabors.ImageSharp.Common.Tuples;
+using SixLabors.ImageSharp.Tuples;
 
 // ReSharper disable ImpureMethodCallOnReadonlyValueField
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
@@ -102,7 +102,7 @@ internal static void ConvertCore(in ComponentValues values, Span<Vector4> result
 
                     // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order:
                     ref Vector4Octet destination = ref Unsafe.Add(ref resultBase, i);
-                    destination.Collect(ref rr, ref gg, ref bb);
+                    destination.Pack(ref rr, ref gg, ref bb);
                 }
             }
         }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
index 60abb7fb2c..456636dc39 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
@@ -6,8 +6,8 @@
 using System.Linq;
 using System.Numerics;
 
-using SixLabors.ImageSharp.Common.Tuples;
 using SixLabors.ImageSharp.Memory;
+using SixLabors.ImageSharp.Tuples;
 using SixLabors.Memory;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
@@ -157,9 +157,9 @@ internal struct Vector4Octet
             public Vector4 V0, V1, V2, V3, V4, V5, V6, V7;
 
             /// <summary>
-            /// Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order.
+            /// Pack (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values as (r0,g0,b0,1), (r1,g1,b1,1) ...
             /// </summary>
-            public void Collect(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b)
+            public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b)
             {
                 this.V0.X = r.A.X;
                 this.V0.Y = g.A.X;
diff --git a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
index 2629ce3f79..bb42ec7e34 100644
--- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
+++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
@@ -3,7 +3,6 @@
 
 using System;
 using System.Numerics;
-using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using SixLabors.Memory;
 
@@ -19,100 +18,18 @@ public partial struct Rgba32
         /// </summary>
         internal partial class PixelOperations : PixelOperations<Rgba32>
         {
-            /// <summary>
-            /// SIMD optimized bulk implementation of <see cref="IPixel.PackFromVector4(Vector4)"/>
-            /// that works only with `count` divisible by <see cref="Vector{UInt32}.Count"/>.
-            /// </summary>
-            /// <param name="sourceColors">The <see cref="Span{T}"/> to the source colors.</param>
-            /// <param name="destVectors">The <see cref="Span{T}"/> to the dstination vectors.</param>
-            /// <param name="count">The number of pixels to convert.</param>
-            /// <remarks>
-            /// Implementation adapted from:
-            /// <see>
-            ///     <cref>http://stackoverflow.com/a/5362789</cref>
-            /// </see>
-            /// TODO: We can replace this implementation in the future using new Vector API-s:
-            /// <see>
-            ///     <cref>https://github.com/dotnet/corefx/issues/15957</cref>
-            /// </see>
-            /// </remarks>
-            internal static void ToVector4SimdAligned(ReadOnlySpan<Rgba32> sourceColors, Span<Vector4> destVectors, int count)
-            {
-                if (!Vector.IsHardwareAccelerated)
-                {
-                    throw new InvalidOperationException(
-                        "Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!");
-                }
-
-                DebugGuard.IsTrue(
-                    count % Vector<uint>.Count == 0,
-                    nameof(count),
-                    "Argument 'count' should divisible by Vector<uint>.Count!");
-
-                var bVec = new Vector<float>(256.0f / 255.0f);
-                var magicFloat = new Vector<float>(32768.0f);
-                var magicInt = new Vector<uint>(1191182336); // reinterpreded value of 32768.0f
-                var mask = new Vector<uint>(255);
-
-                int unpackedRawCount = count * 4;
-
-                ref uint sourceBase = ref Unsafe.As<Rgba32, uint>(ref MemoryMarshal.GetReference(sourceColors));
-                ref UnpackedRGBA destBaseAsUnpacked = ref Unsafe.As<Vector4, UnpackedRGBA>(ref MemoryMarshal.GetReference(destVectors));
-                ref Vector<uint> destBaseAsUInt = ref Unsafe.As<UnpackedRGBA, Vector<uint>>(ref destBaseAsUnpacked);
-                ref Vector<float> destBaseAsFloat = ref Unsafe.As<UnpackedRGBA, Vector<float>>(ref destBaseAsUnpacked);
-
-                for (int i = 0; i < count; i++)
-                {
-                    uint sVal = Unsafe.Add(ref sourceBase, i);
-                    ref UnpackedRGBA dst = ref Unsafe.Add(ref destBaseAsUnpacked, i);
-
-                    // This call is the bottleneck now:
-                    dst.Load(sVal);
-                }
-
-                int numOfVectors = unpackedRawCount / Vector<uint>.Count;
-
-                for (int i = 0; i < numOfVectors; i++)
-                {
-                    Vector<uint> vi = Unsafe.Add(ref destBaseAsUInt, i);
-
-                    vi &= mask;
-                    vi |= magicInt;
-
-                    var vf = Vector.AsVectorSingle(vi);
-                    vf = (vf - magicFloat) * bVec;
-
-                    Unsafe.Add(ref destBaseAsFloat, i) = vf;
-                }
-            }
-
             /// <inheritdoc />
             internal override void ToVector4(ReadOnlySpan<Rgba32> sourceColors, Span<Vector4> destinationVectors, int count)
             {
                 Guard.MustBeSizedAtLeast(sourceColors, count, nameof(sourceColors));
                 Guard.MustBeSizedAtLeast(destinationVectors, count, nameof(destinationVectors));
 
-                if (count < 256 || !Vector.IsHardwareAccelerated)
-                {
-                    // Doesn't worth to bother with SIMD:
-                    base.ToVector4(sourceColors, destinationVectors, count);
-                    return;
-                }
-
-                int remainder = count % Vector<uint>.Count;
-                int alignedCount = count - remainder;
+                sourceColors = sourceColors.Slice(0, count);
+                destinationVectors = destinationVectors.Slice(0, count);
 
-                if (alignedCount > 0)
-                {
-                    ToVector4SimdAligned(sourceColors, destinationVectors, alignedCount);
-                }
-
-                if (remainder > 0)
-                {
-                    sourceColors = sourceColors.Slice(alignedCount);
-                    destinationVectors = destinationVectors.Slice(alignedCount);
-                    base.ToVector4(sourceColors, destinationVectors, remainder);
-                }
+                SimdUtils.BulkConvertByteToNormalizedFloat(
+                    MemoryMarshal.Cast<Rgba32, byte>(sourceColors),
+                    MemoryMarshal.Cast<Vector4, float>(destinationVectors));
             }
 
             /// <inheritdoc />
@@ -120,29 +37,12 @@ internal override void PackFromVector4(ReadOnlySpan<Vector4> sourceVectors, Span
             {
                 GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count);
 
-                if (!SimdUtils.IsAvx2CompatibleArchitecture)
-                {
-                    base.PackFromVector4(sourceVectors, destinationColors, count);
-                    return;
-                }
-
-                int remainder = count % 2;
-                int alignedCount = count - remainder;
+                sourceVectors = sourceVectors.Slice(0, count);
+                destinationColors = destinationColors.Slice(0, count);
 
-                if (alignedCount > 0)
-                {
-                    ReadOnlySpan<float> flatSrc = MemoryMarshal.Cast<Vector4, float>(sourceVectors.Slice(0, alignedCount));
-                    Span<byte> flatDest = MemoryMarshal.Cast<Rgba32, byte>(destinationColors);
-
-                    SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(flatSrc, flatDest);
-                }
-
-                if (remainder > 0)
-                {
-                    // actually: remainder == 1
-                    int lastIdx = count - 1;
-                    destinationColors[lastIdx].PackFromVector4(sourceVectors[lastIdx]);
-                }
+                SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(
+                    MemoryMarshal.Cast<Vector4, float>(sourceVectors),
+                    MemoryMarshal.Cast<Rgba32, byte>(destinationColors));
             }
 
             /// <inheritdoc />
@@ -172,30 +72,6 @@ internal override void ToRgba32(ReadOnlySpan<Rgba32> sourcePixels, Span<Rgba32>
 
                 sourcePixels.Slice(0, count).CopyTo(dest);
             }
-
-            /// <summary>
-            /// Value type to store <see cref="Rgba32"/>-s unpacked into multiple <see cref="uint"/>-s.
-            /// </summary>
-            [StructLayout(LayoutKind.Sequential)]
-            private struct UnpackedRGBA
-            {
-                private uint r;
-
-                private uint g;
-
-                private uint b;
-
-                private uint a;
-
-                [MethodImpl(MethodImplOptions.AggressiveInlining)]
-                public void Load(uint p)
-                {
-                    this.r = p;
-                    this.g = p >> GreenShift;
-                    this.b = p >> BlueShift;
-                    this.a = p >> AlphaShift;
-                }
-            }
         }
     }
 }
\ No newline at end of file
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
index a5fa59ba07..eaa52a9750 100644
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
@@ -3,6 +3,7 @@
 
 // ReSharper disable InconsistentNaming
 
+using System;
 using System.Buffers;
 using System.Numerics;
 using System.Runtime.CompilerServices;
@@ -19,11 +20,14 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
     public abstract class PackFromVector4<TPixel>
         where TPixel : struct, IPixel<TPixel>
     {
-        private IMemoryOwner<Vector4> source;
+        protected IMemoryOwner<Vector4> source;
 
-        private IMemoryOwner<TPixel> destination;
+        protected IMemoryOwner<TPixel> destination;
 
-        [Params(16, 128, 512)]
+        [Params(
+            64,
+            2048
+            )]
         public int Count { get; set; }
 
         [GlobalSetup]
@@ -40,7 +44,7 @@ public void Cleanup()
             this.source.Dispose();
         }
 
-        [Benchmark(Baseline = true)]
+        //[Benchmark]
         public void PerElement()
         {
             ref Vector4 s = ref MemoryMarshal.GetReference(this.source.GetSpan());
@@ -53,13 +57,13 @@ public void PerElement()
         }
 
         [Benchmark]
-        public void CommonBulk()
+        public void PixelOperations_Base()
         {
             new PixelOperations<TPixel>().PackFromVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count);
         }
 
         [Benchmark]
-        public void OptimizedBulk()
+        public void PixelOperations_Specialized()
         {
             PixelOperations<TPixel>.Instance.PackFromVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count);
         }
@@ -67,6 +71,58 @@ public void OptimizedBulk()
 
     public class PackFromVector4_Rgba32 : PackFromVector4<Rgba32>
     {
+        [Benchmark]
+        public void FallbackIntrinsics128()
+        {
+            Span<float> sFloats = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
+            Span<byte> dBytes = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
+            // Source is viewed as raw floats and destination as raw bytes; convert with the FallbackIntrinsics128 implementation.
+            SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(sFloats, dBytes);
+        }
+
+        [Benchmark(Baseline = true)]
+        public void BasicIntrinsics256()
+        {
+            Span<float> sFloats = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
+            Span<byte> dBytes = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
+            // Source is viewed as raw floats and destination as raw bytes; this implementation is the baseline of the comparison.
+            SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(sFloats, dBytes);
+        }
+
+        [Benchmark]
+        public void ExtendedIntrinsic()
+        {
+            Span<float> sFloats = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
+            Span<byte> dBytes = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
+            // Source is viewed as raw floats and destination as raw bytes; convert with the ExtendedIntrinsics implementation.
+            SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(sFloats, dBytes);
+        }
 
+        // RESULTS (2018 October):
+        //                       Method | Runtime | Count |         Mean |        Error |      StdDev | Scaled | ScaledSD |  Gen 0 | Allocated |
+        // ---------------------------- |-------- |------ |-------------:|-------------:|------------:|-------:|---------:|-------:|----------:|
+        //        FallbackIntrinsics128 |     Clr |    64 |    340.38 ns |    22.319 ns |   1.2611 ns |   1.41 |     0.01 |      - |       0 B |
+        //           BasicIntrinsics256 |     Clr |    64 |    240.79 ns |    11.421 ns |   0.6453 ns |   1.00 |     0.00 |      - |       0 B |
+        //            ExtendedIntrinsic |     Clr |    64 |    199.09 ns |   124.239 ns |   7.0198 ns |   0.83 |     0.02 |      - |       0 B |
+        //         PixelOperations_Base |     Clr |    64 |    647.99 ns |    24.003 ns |   1.3562 ns |   2.69 |     0.01 | 0.0067 |      24 B |
+        //  PixelOperations_Specialized |     Clr |    64 |    259.79 ns |    13.391 ns |   0.7566 ns |   1.08 |     0.00 |      - |       0 B | <--- ceremonial overhead has been minimized!
+        //                              |         |       |              |              |             |        |          |        |           |
+        //        FallbackIntrinsics128 |    Core |    64 |    234.64 ns |    12.320 ns |   0.6961 ns |   1.58 |     0.00 |      - |       0 B |
+        //           BasicIntrinsics256 |    Core |    64 |    148.87 ns |     2.794 ns |   0.1579 ns |   1.00 |     0.00 |      - |       0 B |
+        //            ExtendedIntrinsic |    Core |    64 |     94.06 ns |    10.015 ns |   0.5659 ns |   0.63 |     0.00 |      - |       0 B |
+        //         PixelOperations_Base |    Core |    64 |    573.52 ns |    31.865 ns |   1.8004 ns |   3.85 |     0.01 | 0.0067 |      24 B |
+        //  PixelOperations_Specialized |    Core |    64 |    117.21 ns |    13.264 ns |   0.7494 ns |   0.79 |     0.00 |      - |       0 B |
+        //                              |         |       |              |              |             |        |          |        |           |
+        //        FallbackIntrinsics128 |     Clr |  2048 |  6,735.93 ns | 2,139.340 ns | 120.8767 ns |   1.71 |     0.03 |      - |       0 B |
+        //           BasicIntrinsics256 |     Clr |  2048 |  3,929.29 ns |   334.027 ns |  18.8731 ns |   1.00 |     0.00 |      - |       0 B |
+        //            ExtendedIntrinsic |     Clr |  2048 |  2,226.01 ns |   130.525 ns |   7.3749 ns |!! 0.57 |     0.00 |      - |       0 B | <--- ExtendedIntrinsics rock!
+        //         PixelOperations_Base |     Clr |  2048 | 16,760.84 ns |   367.800 ns |  20.7814 ns |   4.27 |     0.02 |      - |      24 B | <--- Extra copies using "Vector4 TPixel.ToVector4()"
+        //  PixelOperations_Specialized |     Clr |  2048 |  3,986.03 ns |   237.238 ns |  13.4044 ns |   1.01 |     0.00 |      - |       0 B | <--- can't yet detect whether ExtendedIntrinsics are available :(
+        //                              |         |       |              |              |             |        |          |        |           |
+        //        FallbackIntrinsics128 |    Core |  2048 |  6,644.65 ns | 2,677.090 ns | 151.2605 ns |   1.69 |     0.05 |      - |       0 B |
+        //           BasicIntrinsics256 |    Core |  2048 |  3,923.70 ns | 1,971.760 ns | 111.4081 ns |   1.00 |     0.00 |      - |       0 B |
+        //            ExtendedIntrinsic |    Core |  2048 |  2,092.32 ns |   375.657 ns |  21.2253 ns |!! 0.53 |     0.01 |      - |       0 B | <--- ExtendedIntrinsics rock!
+        //         PixelOperations_Base |    Core |  2048 | 16,875.73 ns | 1,271.957 ns |  71.8679 ns |   4.30 |     0.10 |      - |      24 B |
+        //  PixelOperations_Specialized |    Core |  2048 |  2,129.92 ns |   262.888 ns |  14.8537 ns |!! 0.54 |     0.01 |      - |       0 B | <--- ExtendedIntrinsics rock!
     }
 }
\ No newline at end of file
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
index 50fac25139..2cbe549e4a 100644
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
@@ -6,8 +6,14 @@
 using System.Buffers;
 using System;
 using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 
 using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Attributes.Jobs;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Environments;
+using BenchmarkDotNet.Jobs;
 
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;
@@ -17,11 +23,17 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
     public abstract class ToVector4<TPixel>
         where TPixel : struct, IPixel<TPixel>
     {
-        private IMemoryOwner<TPixel> source;
+        protected IMemoryOwner<TPixel> source;
 
-        private IMemoryOwner<Vector4> destination;
+        protected IMemoryOwner<Vector4> destination;
 
-        [Params(64, 300, 1024)]
+        [Params(
+            64, 
+            //256,
+            //512,
+            //1024,
+            2048
+            )]
         public int Count { get; set; }
 
         [GlobalSetup]
@@ -38,7 +50,7 @@ public void Cleanup()
             this.destination.Dispose();
         }
 
-        [Benchmark(Baseline = true)]
+        //[Benchmark]
         public void PerElement()
         {
             Span<TPixel> s = this.source.GetSpan();
@@ -46,25 +58,163 @@ public void PerElement()
 
             for (int i = 0; i < this.Count; i++)
             {
-                TPixel c = s[i];
-                d[i] = c.ToVector4();
+                d[i] = s[i].ToVector4();
             }
         }
 
         [Benchmark]
-        public void CommonBulk()
+        public void PixelOperations_Base()
         {
             new PixelOperations<TPixel>().ToVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count);
         }
 
         [Benchmark]
-        public void OptimizedBulk()
+        public void PixelOperations_Specialized()
         {
             PixelOperations<TPixel>.Instance.ToVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count);
         }
     }
 
+    [Config(typeof(Config.ShortClr))]
     public class ToVector4_Rgba32 : ToVector4<Rgba32>
     {
+        [Benchmark]
+        public void FallbackIntrinsics128()
+        {
+            Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
+            Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
+
+            SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(sBytes, dFloats);
+        }
+        
+        [Benchmark(Baseline = true)]
+        public void BasicIntrinsics256()
+        {
+            Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
+            Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
+
+            SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(sBytes, dFloats);
+        }
+
+        [Benchmark]
+        public void ExtendedIntrinsics()
+        {
+            Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
+            Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
+
+            SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats);
+        }
+
+        //[Benchmark] (currently disabled) Two-pass variant: pass 1 widens bytes to uints, pass 2 converts them to scaled floats in place.
+        public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops()
+        {
+            Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
+            Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
+
+            int n = dFloats.Length / Vector<byte>.Count; // iterations of pass 1; NOTE(review): assumes sBytes.Length == dFloats.Length - confirm
+
+            ref Vector<byte> sourceBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference((ReadOnlySpan<byte>)sBytes));
+            ref Vector<float> destBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dFloats));
+            ref Vector<uint> destBaseU = ref Unsafe.As<Vector<float>, Vector<uint>>(ref destBase); // same destination memory, viewed as uint vectors
+
+            for (int i = 0; i < n; i++)
+            {
+                Vector<byte> b = Unsafe.Add(ref sourceBase, i);
+
+                Vector.Widen(b, out Vector<ushort> s0, out Vector<ushort> s1); // byte -> ushort (two halves)
+                Vector.Widen(s0, out Vector<uint> w0, out Vector<uint> w1); // ushort -> uint
+                Vector.Widen(s1, out Vector<uint> w2, out Vector<uint> w3);
+
+                ref Vector<uint> d = ref Unsafe.Add(ref destBaseU, i * 4); // one byte vector expands into four uint vectors
+                d = w0;
+                Unsafe.Add(ref d, 1) = w1;
+                Unsafe.Add(ref d, 2) = w2;
+                Unsafe.Add(ref d, 3) = w3;
+            }
+
+            n = dFloats.Length / Vector<float>.Count; // pass 2 visits the destination one float-sized vector at a time
+            var scale = new Vector<float>(1f / 255f);
+
+            for (int i = 0; i < n; i++)
+            {
+                ref Vector<float> dRef = ref Unsafe.Add(ref destBase, i);
+
+                Vector<int> du = Vector.AsVectorInt32(dRef); // reinterpret the integers stored by pass 1
+                Vector<float> v = Vector.ConvertToSingle(du); // numeric int -> float conversion
+                v *= scale;
+
+                dRef = v; // overwrite the integer bits with the scaled float result
+            }
+        }
+
+        //[Benchmark] (currently disabled) Single-pass variant: widen, convert and scale inside the same loop iteration.
+        public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop()
+        {
+            Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
+            Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
+
+            int n = dFloats.Length / Vector<byte>.Count; // iteration count; NOTE(review): assumes sBytes.Length == dFloats.Length - confirm
+
+            ref Vector<byte> sourceBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference((ReadOnlySpan<byte>)sBytes));
+            ref Vector<float> destBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dFloats));
+            var scale = new Vector<float>(1f / 255f);
+
+            for (int i = 0; i < n; i++)
+            {
+                Vector<byte> b = Unsafe.Add(ref sourceBase, i);
+
+                Vector.Widen(b, out Vector<ushort> s0, out Vector<ushort> s1); // byte -> ushort (two halves)
+                Vector.Widen(s0, out Vector<uint> w0, out Vector<uint> w1); // ushort -> uint
+                Vector.Widen(s1, out Vector<uint> w2, out Vector<uint> w3);
+
+                Vector<float> f0 = ConvertToNormalizedSingle(w0, scale); // convert and scale while the data is still in registers
+                Vector<float> f1 = ConvertToNormalizedSingle(w1, scale);
+                Vector<float> f2 = ConvertToNormalizedSingle(w2, scale);
+                Vector<float> f3 = ConvertToNormalizedSingle(w3, scale);
+
+                ref Vector<float> d = ref Unsafe.Add(ref destBase, i * 4); // one byte vector expands into four float vectors
+                d = f0;
+                Unsafe.Add(ref d, 1) = f1;
+                Unsafe.Add(ref d, 2) = f2;
+                Unsafe.Add(ref d, 3) = f3;
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector<float> ConvertToNormalizedSingle(Vector<uint> u, Vector<float> scale) // returns (float)u * scale, element-wise
+        {
+            Vector<int> vi = Vector.AsVectorInt32(u); // reinterpret the uint vector as a signed int vector
+            Vector<float> v = Vector.ConvertToSingle(vi); // numeric int -> float conversion
+            v *= scale;
+            return v;
+        }
+
+        // RESULTS (2018 October):
+        //
+        //                       Method | Runtime | Count |        Mean |        Error |      StdDev | Scaled | ScaledSD |  Gen 0 | Allocated |
+        // ---------------------------- |-------- |------ |------------:|-------------:|------------:|-------:|---------:|-------:|----------:|
+        //        FallbackIntrinsics128 |     Clr |    64 |   287.62 ns |     6.026 ns |   0.3405 ns |   1.19 |     0.00 |      - |       0 B |
+        //           BasicIntrinsics256 |     Clr |    64 |   240.83 ns |    10.585 ns |   0.5981 ns |   1.00 |     0.00 |      - |       0 B |
+        //           ExtendedIntrinsics |     Clr |    64 |   168.28 ns |    11.478 ns |   0.6485 ns |   0.70 |     0.00 |      - |       0 B |
+        //         PixelOperations_Base |     Clr |    64 |   334.08 ns |    38.048 ns |   2.1498 ns |   1.39 |     0.01 | 0.0072 |      24 B |
+        //  PixelOperations_Specialized |     Clr |    64 |   255.41 ns |    10.939 ns |   0.6181 ns |   1.06 |     0.00 |      - |       0 B | <--- ceremonial overhead has been minimized!
+        //                              |         |       |             |              |             |        |          |        |           |
+        //        FallbackIntrinsics128 |    Core |    64 |   183.29 ns |     8.931 ns |   0.5046 ns |   1.32 |     0.00 |      - |       0 B |
+        //           BasicIntrinsics256 |    Core |    64 |   139.18 ns |     7.633 ns |   0.4313 ns |   1.00 |     0.00 |      - |       0 B |
+        //           ExtendedIntrinsics |    Core |    64 |    66.29 ns |    16.366 ns |   0.9247 ns |   0.48 |     0.01 |      - |       0 B |
+        //         PixelOperations_Base |    Core |    64 |   257.75 ns |    16.959 ns |   0.9582 ns |   1.85 |     0.01 | 0.0072 |      24 B |
+        //  PixelOperations_Specialized |    Core |    64 |    90.14 ns |     9.955 ns |   0.5625 ns |   0.65 |     0.00 |      - |       0 B |
+        //                              |         |       |             |              |             |        |          |        |           |
+        //        FallbackIntrinsics128 |     Clr |  2048 | 5,011.84 ns |   347.991 ns |  19.6621 ns |   1.22 |     0.01 |      - |       0 B |
+        //           BasicIntrinsics256 |     Clr |  2048 | 4,119.35 ns |   720.153 ns |  40.6900 ns |   1.00 |     0.00 |      - |       0 B |
+        //           ExtendedIntrinsics |     Clr |  2048 | 1,195.29 ns |   164.389 ns |   9.2883 ns |!! 0.29 |     0.00 |      - |       0 B | <--- ExtendedIntrinsics rock!
+        //         PixelOperations_Base |     Clr |  2048 | 6,820.58 ns |   823.433 ns |  46.5255 ns |   1.66 |     0.02 |      - |      24 B |
+        //  PixelOperations_Specialized |     Clr |  2048 | 4,203.53 ns |   176.714 ns |   9.9847 ns |   1.02 |     0.01 |      - |       0 B | <--- can't yet detect whether ExtendedIntrinsics are available :(
+        //                              |         |       |             |              |             |        |          |        |           |
+        //        FallbackIntrinsics128 |    Core |  2048 | 5,017.89 ns | 4,021.533 ns | 227.2241 ns |   1.24 |     0.05 |      - |       0 B |
+        //           BasicIntrinsics256 |    Core |  2048 | 4,046.51 ns | 1,150.390 ns |  64.9992 ns |   1.00 |     0.00 |      - |       0 B |
+        //           ExtendedIntrinsics |    Core |  2048 | 1,130.59 ns |   832.588 ns |  47.0427 ns |!! 0.28 |     0.01 |      - |       0 B | <--- ExtendedIntrinsics rock!
+        //         PixelOperations_Base |    Core |  2048 | 6,752.68 ns |   272.820 ns |  15.4148 ns |   1.67 |     0.02 |      - |      24 B |
+        //  PixelOperations_Specialized |    Core |  2048 | 1,126.13 ns |    79.192 ns |   4.4745 ns |!! 0.28 |     0.00 |      - |       0 B | <--- ExtendedIntrinsics rock!
     }
 }
\ No newline at end of file
diff --git a/tests/ImageSharp.Benchmarks/General/Abs.cs b/tests/ImageSharp.Benchmarks/General/BasicMath/Abs.cs
similarity index 88%
rename from tests/ImageSharp.Benchmarks/General/Abs.cs
rename to tests/ImageSharp.Benchmarks/General/BasicMath/Abs.cs
index a67f3f1078..ea53959b6a 100644
--- a/tests/ImageSharp.Benchmarks/General/Abs.cs
+++ b/tests/ImageSharp.Benchmarks/General/BasicMath/Abs.cs
@@ -1,9 +1,9 @@
-﻿namespace SixLabors.ImageSharp.Benchmarks.General
-{
-    using System;
+﻿using System;
 
-    using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Attributes;
 
+namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath
+{
     public class Abs
     {
         [Params(-1, 1)]
diff --git a/tests/ImageSharp.Benchmarks/General/BasicMath/ClampFloat.cs b/tests/ImageSharp.Benchmarks/General/BasicMath/ClampFloat.cs
new file mode 100644
index 0000000000..3b7dea0955
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/General/BasicMath/ClampFloat.cs
@@ -0,0 +1,70 @@
+﻿using System;
+using System.Runtime.CompilerServices;
+
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Running;
+
+namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath
+{
+    public class ClampFloat
+    {
+        private readonly float min = -1.5f;
+        private readonly float max = 2.5f;
+        private static readonly float[] Values = { -10, -5, -3, -1.5f, -0.5f, 0f, 1f, 1.5f, 2.5f, 3, 10 };
+
+        [Benchmark(Baseline = true)]
+        public float UsingMathF()
+        {
+            float acc = 0;
+
+            for (int i = 0; i < Values.Length; i++)
+            {
+                acc += ClampUsingMathF(Values[i], this.min, this.max);
+            }
+
+            return acc;
+        }
+
+        [Benchmark]
+        public float UsingBranching()
+        {
+            float acc = 0;
+
+            for (int i = 0; i < Values.Length; i++)
+            {
+                acc += ClampUsingBranching(Values[i], this.min, this.max);
+            }
+
+            return acc;
+        }
+
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static float ClampUsingMathF(float x, float min, float max)
+        {
+            return Math.Min(max, Math.Max(min, x));
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static float ClampUsingBranching(float x, float min, float max)
+        {
+            if (x >= max)
+            {
+                return max;
+            }
+
+            if (x <= min)
+            {
+                return min;
+            }
+
+            return x;
+        }
+
+        // RESULTS:
+        //          Method |     Mean |     Error |    StdDev | Scaled |
+        // --------------- |---------:|----------:|----------:|-------:|
+        //      UsingMathF | 30.37 ns | 0.3764 ns | 0.3337 ns |   1.00 |
+        //  UsingBranching | 18.66 ns | 0.1043 ns | 0.0871 ns |   0.61 |
+    }
+}
\ No newline at end of file
diff --git a/tests/ImageSharp.Benchmarks/General/Clamp.cs b/tests/ImageSharp.Benchmarks/General/BasicMath/ClampInt32IntoByte.cs
similarity index 93%
rename from tests/ImageSharp.Benchmarks/General/Clamp.cs
rename to tests/ImageSharp.Benchmarks/General/BasicMath/ClampInt32IntoByte.cs
index ef6bc3c402..6ce82ba115 100644
--- a/tests/ImageSharp.Benchmarks/General/Clamp.cs
+++ b/tests/ImageSharp.Benchmarks/General/BasicMath/ClampInt32IntoByte.cs
@@ -3,14 +3,14 @@
 // Licensed under the Apache License, Version 2.0.
 // </copyright>
 
-namespace SixLabors.ImageSharp.Benchmarks.General
-{
-    using System;
-    using System.Runtime.CompilerServices;
+using System;
+using System.Runtime.CompilerServices;
 
-    using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Attributes;
 
-    public class Clamp
+namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath
+{
+    public class ClampInt32IntoByte
     {
         [Params(-1, 0, 255, 256)]
         public int Value { get; set; }
diff --git a/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoConstant.cs b/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoConstant.cs
new file mode 100644
index 0000000000..9ddfad7222
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoConstant.cs
@@ -0,0 +1,23 @@
+﻿using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Attributes.Jobs;
+
+namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath
+{
+    [LongRunJob]
+    public class ModuloPowerOfTwoConstant
+    {
+        private readonly int value = 42;
+
+        [Benchmark(Baseline = true)]
+        public int Standard()
+        {
+            return this.value % 8;
+        }
+
+        [Benchmark]
+        public int Bitwise()
+        {
+            return ImageMaths.Modulo8(this.value);
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoVariable.cs b/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoVariable.cs
new file mode 100644
index 0000000000..5c2fe81fa2
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoVariable.cs
@@ -0,0 +1,32 @@
+﻿using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Attributes.Jobs;
+
+namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath
+{
+    [LongRunJob]
+    public class ModuloPowerOfTwoVariable
+    {
+        private readonly int value = 42;
+
+        private readonly int m = 32;
+
+        [Benchmark(Baseline = true)]
+        public int Standard()
+        {
+            return this.value % this.m;
+        }
+
+        [Benchmark]
+        public int Bitwise()
+        {
+            return ImageMaths.ModuloP2(this.value, this.m);
+        }
+
+        // RESULTS:
+        //
+        //    Method |      Mean |     Error |    StdDev |    Median | Scaled | ScaledSD |
+        // --------- |----------:|----------:|----------:|----------:|-------:|---------:|
+        //  Standard | 1.2465 ns | 0.0093 ns | 0.0455 ns | 1.2423 ns |   1.00 |     0.00 |
+        //   Bitwise | 0.0265 ns | 0.0103 ns | 0.0515 ns | 0.0000 ns |   0.02 |     0.04 |
+    }
+}
\ No newline at end of file
diff --git a/tests/ImageSharp.Benchmarks/General/Pow.cs b/tests/ImageSharp.Benchmarks/General/BasicMath/Pow.cs
similarity index 93%
rename from tests/ImageSharp.Benchmarks/General/Pow.cs
rename to tests/ImageSharp.Benchmarks/General/BasicMath/Pow.cs
index 325bd9d20e..0f256fc781 100644
--- a/tests/ImageSharp.Benchmarks/General/Pow.cs
+++ b/tests/ImageSharp.Benchmarks/General/BasicMath/Pow.cs
@@ -1,7 +1,8 @@
 ﻿using System;
+
 using BenchmarkDotNet.Attributes;
 
-namespace SixLabors.ImageSharp.Benchmarks.General
+namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath
 {
     public class Pow
     {
diff --git a/tests/ImageSharp.Benchmarks/General/Modulus.cs b/tests/ImageSharp.Benchmarks/General/Modulus.cs
deleted file mode 100644
index e6d5ccce62..0000000000
--- a/tests/ImageSharp.Benchmarks/General/Modulus.cs
+++ /dev/null
@@ -1,19 +0,0 @@
-﻿namespace SixLabors.ImageSharp.Benchmarks.General
-{
-    using BenchmarkDotNet.Attributes;
-
-    public class Modulus
-    {
-        [Benchmark(Baseline = true, Description = "Standard Modulus using %")]
-        public int StandardModulus()
-        {
-            return 255 % 256;
-        }
-
-        [Benchmark(Description = "Bitwise Modulus using &")]
-        public int BitwiseModulus()
-        {
-            return 255 & 255;
-        }
-    }
-}
diff --git a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs
new file mode 100644
index 0000000000..ca85a350cc
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs
@@ -0,0 +1,113 @@
+﻿using System.Numerics;
+using System.Runtime.CompilerServices;
+
+using BenchmarkDotNet.Attributes;
+
+namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
+{
+    [Config(typeof(Config.ShortClr))]
+    public class UInt32ToSingle
+    {
+        private float[] data;
+
+        private const int Count = 32;
+
+        [GlobalSetup]
+        public void Setup()
+        {
+            this.data = new float[Count];
+        }
+
+        [Benchmark(Baseline = true)]
+        public void MagicMethod() // converts the low byte of each 32-bit element to a float scaled by 1/255 using bit manipulation only
+        {
+            ref Vector<float> b = ref Unsafe.As<float, Vector<float>>(ref this.data[0]);
+
+            int n = Count / Vector<float>.Count;
+
+            var bVec = new Vector<float>(256.0f / 255.0f);
+            var magicFloat = new Vector<float>(32768.0f);
+            var magicInt = new Vector<uint>(1191182336); // reinterpreted value of 32768.0f (0x47000000)
+            var mask = new Vector<uint>(255);
+
+            for (int i = 0; i < n; i++)
+            {
+                // Inverse of the float -> byte trick: union { float f; uint32_t i; } u; u.f = 32768.0f + x * (255.0f / 256.0f); return (uint8_t)u.i;
+                // Here the low 8 bits (x) are OR-ed into the bit pattern of 32768.0f, which reads back as the float 32768 + x / 256,
+                // so (vf - 32768) * (256 / 255) == x / 255.
+
+                ref Vector<float> df = ref Unsafe.Add(ref b, i);
+
+                var vi = Vector.AsVectorUInt32(df);
+                vi &= mask; // keep only the low 8 bits of every element
+                vi |= magicInt; // place them in the low mantissa bits of 32768.0f
+
+                var vf = Vector.AsVectorSingle(vi);
+                vf = (vf - magicFloat) * bVec;
+
+                df = vf; // write the result back in place
+            }
+        }
+
+        [Benchmark]
+        public void StandardSimd()
+        {
+            int n = Count / Vector<float>.Count;
+
+            ref Vector<float> bf = ref Unsafe.As<float, Vector<float>>(ref this.data[0]);
+            ref Vector<uint> bu = ref Unsafe.As<Vector<float>, Vector<uint>>(ref bf);
+
+            var scale = new Vector<float>(1f / 255f);
+
+            for (int i = 0; i < n; i++)
+            {
+                Vector<uint> u = Unsafe.Add(ref bu, i);
+                Vector<float> v = Vector.ConvertToSingle(u);
+                v *= scale;
+                Unsafe.Add(ref bf, i) = v;
+            }
+        }
+        
+        [Benchmark]
+        public void StandardSimdFromInt()
+        {
+            int n = Count / Vector<float>.Count;
+
+            ref Vector<float> bf = ref Unsafe.As<float, Vector<float>>(ref this.data[0]);
+            ref Vector<int> bu = ref Unsafe.As<Vector<float>, Vector<int>>(ref bf);
+
+            var scale = new Vector<float>(1f / 255f);
+
+            for (int i = 0; i < n; i++)
+            {
+                Vector<int> u = Unsafe.Add(ref bu, i);
+                Vector<float> v = Vector.ConvertToSingle(u);
+                v *= scale;
+                Unsafe.Add(ref bf, i) = v;
+            }
+        }
+
+
+        [Benchmark]
+        public void StandardSimdFromInt_RefCast()
+        {
+            int n = Count / Vector<float>.Count;
+
+            // Only the float view is needed here: each vector is reinterpreted as Int32 inside the loop via Vector.AsVectorInt32.
+            ref Vector<float> bf = ref Unsafe.As<float, Vector<float>>(ref this.data[0]);
+
+            var scale = new Vector<float>(1f / 255f);
+
+            for (int i = 0; i < n; i++)
+            {
+                ref Vector<float> fRef = ref Unsafe.Add(ref bf, i);
+
+                Vector<int> du = Vector.AsVectorInt32(fRef); // reinterpret the stored bits as signed integers
+                Vector<float> v = Vector.ConvertToSingle(du); // numeric int -> float conversion
+                v *= scale;
+
+                fRef = v; // store the scaled result back in place
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs b/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs
new file mode 100644
index 0000000000..2bc3af4c98
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs
@@ -0,0 +1,64 @@
+﻿using System.Numerics;
+using System.Runtime.CompilerServices;
+
+using BenchmarkDotNet.Attributes;
+
+using SixLabors.ImageSharp.Tuples;
+
+namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
+{
+    [Config(typeof(Config.ShortClr))]
+    public class WidenBytesToUInt32
+    {
+        private byte[] source;
+
+        private uint[] dest;
+
+        private const int Count = 64;
+
+        [GlobalSetup]
+        public void Setup()
+        {
+            this.source = new byte[Count];
+            this.dest = new uint[Count];
+        }
+
+        [Benchmark(Baseline = true)]
+        public void Standard()
+        {
+            const int N = Count / 8;
+
+            ref Octet.OfByte sBase = ref Unsafe.As<byte, Octet.OfByte>(ref this.source[0]);
+            ref Octet.OfUInt32 dBase = ref Unsafe.As<uint, Octet.OfUInt32>(ref this.dest[0]);
+
+            for (int i = 0; i < N; i++)
+            {
+                Unsafe.Add(ref dBase, i).LoadFrom(ref Unsafe.Add(ref sBase, i));
+            }
+        }
+
+        [Benchmark]
+        public void Simd()
+        {
+            int n = Count / Vector<byte>.Count;
+
+            ref Vector<byte> sBase = ref Unsafe.As<byte, Vector<byte>>(ref this.source[0]);
+            ref Vector<uint> dBase = ref Unsafe.As<uint, Vector<uint>>(ref this.dest[0]);
+
+            for (int i = 0; i < n; i++)
+            {
+                Vector<byte> b = Unsafe.Add(ref sBase, i);
+
+                Vector.Widen(b, out Vector<ushort> s0, out Vector<ushort> s1);
+                Vector.Widen(s0, out Vector<uint> w0, out Vector<uint> w1);
+                Vector.Widen(s1, out Vector<uint> w2, out Vector<uint> w3);
+
+                ref Vector<uint> d = ref Unsafe.Add(ref dBase, i * 4);
+                d = w0;
+                Unsafe.Add(ref d, 1) = w1;
+                Unsafe.Add(ref d, 2) = w2;
+                Unsafe.Add(ref d, 3) = w3;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
index c6c3b68f33..c63cb3438f 100644
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@@ -62,7 +62,7 @@ private static Vector<float> CreateRandomTestVector(int seed, float min, float m
         {
             float[] data = new float[Vector<float>.Count];
 
-            var rnd = new Random();
+            var rnd = new Random(seed);
 
             for (int i = 0; i < Vector<float>.Count; i++)
             {
@@ -118,7 +118,7 @@ private bool SkipOnNonAvx2([CallerMemberName] string testCaseName = null)
         [InlineData(1, 8)]
         [InlineData(2, 16)]
         [InlineData(3, 128)]
-        public void BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
+        public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
         {
             if (this.SkipOnNonAvx2())
             {
@@ -130,7 +130,7 @@ public void BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count
 
             byte[] dest = new byte[count];
 
-            SimdUtils.BulkConvertNormalizedFloatToByte(normalized, dest);
+            SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(normalized, dest);
 
             byte[] expected = orig.Select(f => (byte)(f)).ToArray();
 
@@ -142,7 +142,7 @@ public void BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count
         [InlineData(1, 8)]
         [InlineData(2, 16)]
         [InlineData(3, 128)]
-        public void BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
+        public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
         {
             if (this.SkipOnNonAvx2())
             {
@@ -153,39 +153,147 @@ public void BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int co
 
             byte[] dest = new byte[count];
 
-            SimdUtils.BulkConvertNormalizedFloatToByte(source, dest);
+            SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(source, dest);
 
             byte[] expected = source.Select(f => (byte)Math.Round(f * 255f)).ToArray();
 
             Assert.Equal(expected, dest);
         }
 
-        private static float Clamp255(float x) => Math.Min(255f, Math.Max(0f, x));
+        // Lengths for the implementation-specific tests: every value (0 included) is a multiple of the number in the field name.
+        public static readonly TheoryData<int> ArraySizesDivisibleBy8 = new TheoryData<int> { 0, 8, 16, 1024 };
+        public static readonly TheoryData<int> ArraySizesDivisibleBy4 = new TheoryData<int> { 0, 4, 8, 28, 1020 };
+        public static readonly TheoryData<int> ArraySizesDivisibleBy32 = new TheoryData<int> { 0, 32, 512 };
+
+        // Lengths with no divisibility guarantee; each value appears once so no theory row is generated twice.
+        public static readonly TheoryData<int> ArbitraryArraySizes = new TheoryData<int>
+        {
+            0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520,
+        };
 
         [Theory]
-        [InlineData(1, 0)]
-        [InlineData(1, 8)]
-        [InlineData(2, 16)]
-        [InlineData(3, 128)]
-        public void BulkConvertNormalizedFloatToByteClampOverflows(int seed, int count)
+        [MemberData(nameof(ArraySizesDivisibleBy4))]
+        public void FallbackIntrinsics128_BulkConvertByteToNormalizedFloat(int count)
+        {
+            // Calls the FallbackIntrinsics128 converter directly; the member data only supplies lengths divisible by 4.
+            void RunConversion(Memory<byte> bytes, Memory<float> floats) => SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(bytes.Span, floats.Span);
+            TestImpl_BulkConvertByteToNormalizedFloat(count, RunConversion);
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesDivisibleBy8))]
+        public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat(int count)
         {
             if (this.SkipOnNonAvx2())
             {
                 return;
             }
 
-            float[] orig = new Random(seed).GenerateRandomRoundedFloatArray(count, -50, 444);
-            float[] normalized = orig.Select(f => f / 255f).ToArray();
+            // Calls the BasicIntrinsics256 converter directly; the member data only supplies lengths divisible by 8.
+            void RunConversion(Memory<byte> bytes, Memory<float> floats) => SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(bytes.Span, floats.Span);
+            TestImpl_BulkConvertByteToNormalizedFloat(count, RunConversion);
+        }
+        
+        [Theory]
+        [MemberData(nameof(ArraySizesDivisibleBy32))]
+        public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int count)
+        {
+            // Calls the ExtendedIntrinsics converter directly; the member data only supplies lengths divisible by 32.
+            void RunConversion(Memory<byte> bytes, Memory<float> floats) => SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(bytes.Span, floats.Span);
+            TestImpl_BulkConvertByteToNormalizedFloat(count, RunConversion);
+        }
 
-            byte[] dest = new byte[count];
+        [Theory]
+        [MemberData(nameof(ArbitraryArraySizes))]
+        public void BulkConvertByteToNormalizedFloat(int count)
+        {
+            // Goes through the top-level SimdUtils entry point, fed with lengths that carry no divisibility guarantee.
+            void RunConversion(Memory<byte> bytes, Memory<float> floats) => SimdUtils.BulkConvertByteToNormalizedFloat(bytes.Span, floats.Span);
+            TestImpl_BulkConvertByteToNormalizedFloat(count, RunConversion);
+        }
 
-            SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(normalized, dest);
+        private static void TestImpl_BulkConvertByteToNormalizedFloat(int count, Action<Memory<byte>, Memory<float>> convert)
+        {
+            // The length doubles as the random seed, so every size gets reproducible input.
+            var rnd = new Random(count);
+            byte[] source = rnd.GenerateRandomByteArray(count);
+            float[] expected = source.Select(b => b / 255f).ToArray();
+            float[] result = new float[count];
 
-            byte[] expected = orig.Select(f => (byte)Clamp255(f)).ToArray();
+            convert(source, result);
 
-            Assert.Equal(expected, dest);
+            Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
         }
 
+        [Theory]
+        [MemberData(nameof(ArraySizesDivisibleBy4))]
+        public void FallbackIntrinsics128_BulkConvertNormalizedFloatToByteClampOverflows(int count)
+        {
+            // Calls the FallbackIntrinsics128 converter directly; the member data only supplies lengths divisible by 4.
+            void RunConversion(Memory<float> floats, Memory<byte> bytes) => SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(floats.Span, bytes.Span);
+            TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, RunConversion);
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesDivisibleBy8))]
+        public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows(int count)
+        {
+            // Nothing is asserted when SkipOnNonAvx2() returns true (presumably: no AVX2-sized vectors on this machine).
+            if (this.SkipOnNonAvx2())
+            {
+                return;
+            }
+
+            void RunConversion(Memory<float> floats, Memory<byte> bytes) => SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(floats.Span, bytes.Span);
+            TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, RunConversion);
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesDivisibleBy32))]
+        public void ExtendedIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
+        {
+            // Calls the ExtendedIntrinsics converter directly; the member data only supplies lengths divisible by 32.
+            void RunConversion(Memory<float> floats, Memory<byte> bytes) => SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(floats.Span, bytes.Span);
+            TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, RunConversion);
+        }
+
+        [Theory]
+        [MemberData(nameof(ArbitraryArraySizes))]
+        public void BulkConvertNormalizedFloatToByteClampOverflows(int count)
+        {
+            void RunConversion(Memory<float> floats, Memory<byte> bytes) => SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(floats.Span, bytes.Span);
+
+            // First pass: no explicit seed, so the test helper derives it from 'count'.
+            TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, RunConversion);
+
+            // Small non-empty inputs are cheap, so stress them with 20 more seeds (42..61).
+            if (count <= 0 || count >= 10)
+            {
+                return;
+            }
+
+            for (int seed = 42; seed < 62; seed++)
+            {
+                TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, RunConversion, seed);
+            }
+        }
+
+        private static void TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(int count, Action<Memory<float>, Memory<byte>> convert, int seed = -1)
+        {
+            // A non-positive seed (including the default -1) means: use the length as the seed.
+            int effectiveSeed = seed > 0 ? seed : count;
+
+            // The -0.2f / 1.2f arguments are presumably the value bounds, so some inputs fall outside [0, 1] and need clamping.
+            float[] source = new Random(effectiveSeed).GenerateRandomFloatArray(count, -0.2f, 1.2f);
+            byte[] expected = source.Select(NormalizedFloatToByte).ToArray();
+            byte[] actual = new byte[count];
+
+            convert(source, actual);
+            Assert.Equal(expected, actual);
+        }
+
+        private static byte NormalizedFloatToByte(float f) => (byte)Math.Max(0f, Math.Min(255f, (f * 255f) + 0.5f)); // +0.5 makes the truncating cast round
+
         [Theory]
         [InlineData(0)]
         [InlineData(7)]
@@ -211,7 +319,7 @@ private void BulkConvertNormalizedFloatToByte_Step()
 
             float[] source = { 0, 7, 42, 255, 0.5f, 1.1f, 2.6f, 16f };
 
-            var expected = source.Select(f => (byte)Math.Round(f)).ToArray();
+            byte[] expected = source.Select(f => (byte)Math.Round(f)).ToArray();
 
             source = source.Select(f => f / 255f).ToArray();
 
@@ -245,8 +353,6 @@ private void MagicConvert(Span<float> source, Span<byte> dest)
 
             iiRef = x;
 
-            //Tuple8.OfUInt32 ii = Unsafe.As<Vector<float>, Tuple8.OfUInt32>(ref x);
-
             ref Tuple8.OfByte d = ref MemoryMarshal.Cast<byte, Tuple8.OfByte>(dest)[0];
             d.LoadFrom(ref ii);
 
diff --git a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
index 6c2979fe9e..75ef611a5c 100644
--- a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
+++ b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
@@ -9,6 +9,74 @@ namespace SixLabors.ImageSharp.Tests.Helpers
 
     public class ImageMathsTests
     {
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        [InlineData(2)]
+        [InlineData(3)]
+        [InlineData(4)]
+        [InlineData(100)]
+        [InlineData(123)]
+        [InlineData(53436353)]
+        public void Modulo4(int x)
+        {
+            // The helper must agree with the % operator on these non-negative inputs.
+            Assert.Equal(x % 4, ImageMaths.Modulo4(x));
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        [InlineData(2)]
+        [InlineData(6)]
+        [InlineData(7)]
+        [InlineData(8)]
+        [InlineData(100)]
+        [InlineData(123)]
+        [InlineData(53436353)]
+        [InlineData(975)]
+        public void Modulo8(int x)
+        {
+            // The helper must agree with the % operator on these non-negative inputs.
+            Assert.Equal(x % 8, ImageMaths.Modulo8(x));
+        }
+
+        [Theory]
+        [InlineData(0, 2)]
+        [InlineData(1, 2)]
+        [InlineData(2, 2)]
+        [InlineData(0, 4)]
+        [InlineData(3, 4)]
+        [InlineData(5, 4)]
+        [InlineData(5, 8)]
+        [InlineData(8, 8)]
+        [InlineData(8, 16)]
+        [InlineData(15, 16)]
+        [InlineData(17, 16)]
+        [InlineData(17, 32)]
+        [InlineData(31, 32)]
+        [InlineData(32, 32)]
+        [InlineData(33, 32)]
+        public void Modulo2P(int x, int m)
+        {
+            // Every modulus in the data is a power of two (2..32); the result must match the % operator.
+            Assert.Equal(x % m, ImageMaths.ModuloP2(x, m));
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, 0)]
+        [InlineData(0.5f, 0, 1, 0.5f)]
+        [InlineData(-0.5f, -0.1f, 10, -0.1f)]
+        [InlineData(-0.05f, -0.1f, 10, -0.05f)]
+        [InlineData(9.9f, -0.1f, 10, 9.9f)]
+        [InlineData(10f, -0.1f, 10, 10f)]
+        [InlineData(10.1f, -0.1f, 10, 10f)]
+        public void Clamp(float x, float min, float max, float expected)
+        {
+            // Rows cover values below, inside, on and above the [min, max] range.
+            Assert.Equal(expected, x.Clamp(min, max));
+        }
+
         [Fact]
         public void FasAbsResultMatchesMath()
         {
diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs
index 9e41fd94f3..abf764881b 100644
--- a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs
+++ b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs
@@ -1,93 +1,73 @@
-﻿// Copyright (c) Six Labors and contributors.
-// Licensed under the Apache License, Version 2.0.
-
-using System;
-using System.Buffers;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-using SixLabors.ImageSharp.Memory;
-using SixLabors.ImageSharp.PixelFormats;
-using Xunit;
-using Xunit.Abstractions;
-
-namespace SixLabors.ImageSharp.Tests.PixelFormats
-{
-    public partial class PixelOperationsTests
-    {
-        public class Rgba32 : PixelOperationsTests<ImageSharp.PixelFormats.Rgba32>
-        {
-            public Rgba32(ITestOutputHelper output)
-                : base(output)
-            {
-            }
-
-            // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class:
-            public static new TheoryData<int> ArraySizesData => new TheoryData<int> { 7, 16, 1111 };
-
-            [Fact]
-            public void IsSpecialImplementation()
-            {
-                Assert.IsType<ImageSharp.PixelFormats.Rgba32.PixelOperations>(PixelOperations<ImageSharp.PixelFormats.Rgba32>.Instance);
-            }
-
-            [Fact]
-            public void ToVector4SimdAligned()
-            {
-                if (!Vector.IsHardwareAccelerated)
-                {
-                    return;
-                }
-
-                ImageSharp.PixelFormats.Rgba32[] source = CreatePixelTestData(64);
-                Vector4[] expected = CreateExpectedVector4Data(source);
-
-                TestOperation(
-                    source,
-                    expected,
-                    (s, d) => ImageSharp.PixelFormats.Rgba32.PixelOperations.ToVector4SimdAligned(s, d.GetSpan(), 64)
-                );
-            }
-
-
-            // [Fact] // Profiling benchmark - enable manually!
-#pragma warning disable xUnit1013 // Public method should be marked as test
-            public void Benchmark_ToVector4()
-#pragma warning restore xUnit1013 // Public method should be marked as test
-            {
-                int times = 200000;
-                int count = 1024;
-
-                using (IMemoryOwner<ImageSharp.PixelFormats.Rgba32> source = Configuration.Default.MemoryAllocator.Allocate<ImageSharp.PixelFormats.Rgba32>(count))
-                using (IMemoryOwner<Vector4> dest = Configuration.Default.MemoryAllocator.Allocate<Vector4>(count))
-                {
-                    this.Measure(
-                        times,
-                        () =>
-                            {
-                                PixelOperations<ImageSharp.PixelFormats.Rgba32>.Instance.ToVector4(source.GetSpan(), dest.GetSpan(), count);
-                            });
-                }
-            }
-        }
-
-        public class Argb32 : PixelOperationsTests<ImageSharp.PixelFormats.Argb32>
-        {
-            // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class:
-            public Argb32(ITestOutputHelper output)
-                : base(output)
-            {
-            }
-
-            public static new TheoryData<int> ArraySizesData => new TheoryData<int> { 7, 16, 1111 };
-        }
-
-        [Theory]
-        [WithBlankImages(1, 1, PixelTypes.All)]
-        public void GetGlobalInstance<TPixel>(TestImageProvider<TPixel> dummy)
-            where TPixel : struct, IPixel<TPixel>
-        {
-            Assert.NotNull(PixelOperations<TPixel>.Instance);
+﻿// Copyright (c) Six Labors and contributors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Buffers;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Memory;
+using SixLabors.ImageSharp.PixelFormats;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace SixLabors.ImageSharp.Tests.PixelFormats
+{
+    public partial class PixelOperationsTests
+    {
+        public class Rgba32 : PixelOperationsTests<ImageSharp.PixelFormats.Rgba32>
+        {
+            // Switch '#if true' to '#if false' to make this null, which stops xUnit from skipping the profiling benchmarks.
+            public const string SkipProfilingBenchmarks =
+#if true
+                "Profiling benchmark - enable manually!";
+#else
+                null;
+#endif
+
+            public Rgba32(ITestOutputHelper output)
+                : base(output)
+            {
+            }
+
+            [Fact]
+            public void IsSpecialImplementation()
+            {
+                PixelOperations<ImageSharp.PixelFormats.Rgba32> instance = PixelOperations<ImageSharp.PixelFormats.Rgba32>.Instance;
+                Assert.IsType<ImageSharp.PixelFormats.Rgba32.PixelOperations>(instance);
+            }
+
+            [Fact(Skip = SkipProfilingBenchmarks)]
+            public void Benchmark_ToVector4()
+            {
+                const int Iterations = 200000;
+                const int PixelCount = 1024;
+                var allocator = Configuration.Default.MemoryAllocator;
+
+                using (IMemoryOwner<ImageSharp.PixelFormats.Rgba32> pixels = allocator.Allocate<ImageSharp.PixelFormats.Rgba32>(PixelCount))
+                using (IMemoryOwner<Vector4> vectors = allocator.Allocate<Vector4>(PixelCount))
+                {
+                    this.Measure(
+                        Iterations,
+                        () => PixelOperations<ImageSharp.PixelFormats.Rgba32>.Instance.ToVector4(pixels.GetSpan(), vectors.GetSpan(), PixelCount));
+                }
+            }
+        }
+
+        public class Argb32 : PixelOperationsTests<ImageSharp.PixelFormats.Argb32>
+        {
+            // Adds no tests of its own; it closes the generic base test class over the Argb32 pixel type.
+            public Argb32(ITestOutputHelper output) : base(output)
+            {
+            }
+        }
+
+        [Theory]
+        [WithBlankImages(1, 1, PixelTypes.All)]
+        public void GetGlobalInstance<TPixel>(TestImageProvider<TPixel> dummy)
+            where TPixel : struct, IPixel<TPixel>
+        {
+            Assert.NotNull(PixelOperations<TPixel>.Instance); // 'dummy' is never read; the parameter only determines TPixel
         }
 
         [Fact]
@@ -99,594 +79,594 @@ public void IsOpaqueColor()
             Assert.False(new GraphicsOptions(true).IsOpaqueColorWithoutBlending(ImageSharp.PixelFormats.Rgba32.Transparent));
             Assert.False(new GraphicsOptions(true, PixelColorBlendingMode.Lighten, 1).IsOpaqueColorWithoutBlending(ImageSharp.PixelFormats.Rgba32.Red));
             Assert.False(new GraphicsOptions(true, PixelColorBlendingMode.Normal,PixelAlphaCompositionMode.DestOver, 1).IsOpaqueColorWithoutBlending(ImageSharp.PixelFormats.Rgba32.Red));
-        }
-    }
-
-    public abstract class PixelOperationsTests<TPixel> : MeasureFixture
-        where TPixel : struct, IPixel<TPixel>
-    {
-        protected PixelOperationsTests(ITestOutputHelper output)
-            : base(output)
-        {
-        }
-
-        public static TheoryData<int> ArraySizesData => new TheoryData<int> { 7, 16, 1111 };
-
-        private static PixelOperations<TPixel> Operations => PixelOperations<TPixel>.Instance;
-
-        internal static TPixel[] CreateExpectedPixelData(Vector4[] source)
-        {
-            var expected = new TPixel[source.Length];
-
-            for (int i = 0; i < expected.Length; i++)
-            {
-                expected[i].PackFromVector4(source[i]);
-            }
-            return expected;
-        }
-
-        internal static TPixel[] CreateScaledExpectedPixelData(Vector4[] source)
-        {
-            var expected = new TPixel[source.Length];
-
-            for (int i = 0; i < expected.Length; i++)
-            {
-                expected[i].PackFromScaledVector4(source[i]);
-            }
-            return expected;
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void PackFromVector4(int count)
-        {
-            Vector4[] source = CreateVector4TestData(count);
-            TPixel[] expected = CreateExpectedPixelData(source);
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.PackFromVector4(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void PackFromScaledVector4(int count)
-        {
-            Vector4[] source = CreateVector4TestData(count);
-            TPixel[] expected = CreateScaledExpectedPixelData(source);
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.PackFromScaledVector4(s, d.GetSpan(), count)
-            );
-        }
-
-        internal static Vector4[] CreateExpectedVector4Data(TPixel[] source)
-        {
-            var expected = new Vector4[source.Length];
-
-            for (int i = 0; i < expected.Length; i++)
-            {
-                expected[i] = source[i].ToVector4();
-            }
-            return expected;
-        }
-
-        internal static Vector4[] CreateExpectedScaledVector4Data(TPixel[] source)
-        {
-            var expected = new Vector4[source.Length];
-
-            for (int i = 0; i < expected.Length; i++)
-            {
-                expected[i] = source[i].ToScaledVector4();
-            }
-            return expected;
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void ToVector4(int count)
-        {
-            TPixel[] source = CreatePixelTestData(count);
-            Vector4[] expected = CreateExpectedVector4Data(source);
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.ToVector4(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void ToScaledVector4(int count)
-        {
-            TPixel[] source = CreateScaledPixelTestData(count);
-            Vector4[] expected = CreateExpectedScaledVector4Data(source);
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.ToScaledVector4(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void PackFromRgb24Bytes(int count)
-        {
-            byte[] source = CreateByteTestData(count * 3);
-            var expected = new TPixel[count];
-
-            for (int i = 0; i < count; i++)
-            {
-                int i3 = i * 3;
-
-                expected[i].PackFromRgba32(new Rgba32(source[i3 + 0], source[i3 + 1], source[i3 + 2], 255));
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.PackFromRgb24Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void ToRgb24Bytes(int count)
-        {
-            TPixel[] source = CreatePixelTestData(count);
-            byte[] expected = new byte[count * 3];
-            var rgb = default(Rgb24);
-
-            for (int i = 0; i < count; i++)
-            {
-                int i3 = i * 3;
-                source[i].ToRgb24(ref rgb);
-                expected[i3] = rgb.R;
-                expected[i3 + 1] = rgb.G;
-                expected[i3 + 2] = rgb.B;
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.ToRgb24Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void PackFromRgba32Bytes(int count)
-        {
-            byte[] source = CreateByteTestData(count * 4);
-            var expected = new TPixel[count];
-
-            for (int i = 0; i < count; i++)
-            {
-                int i4 = i * 4;
-
-                expected[i].PackFromRgba32(new Rgba32(source[i4 + 0], source[i4 + 1], source[i4 + 2], source[i4 + 3]));
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.PackFromRgba32Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void ToRgba32Bytes(int count)
-        {
-            TPixel[] source = CreatePixelTestData(count);
-            byte[] expected = new byte[count * 4];
-            var rgba = default(Rgba32);
-
-            for (int i = 0; i < count; i++)
-            {
-                int i4 = i * 4;
-                source[i].ToRgba32(ref rgba);
-                expected[i4] = rgba.R;
-                expected[i4 + 1] = rgba.G;
-                expected[i4 + 2] = rgba.B;
-                expected[i4 + 3] = rgba.A;
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.ToRgba32Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void PackFromRgb48Bytes(int count)
-        {
-            byte[] source = CreateByteTestData(count * 6);
-            Span<byte> sourceSpan = source.AsSpan();
-            var expected = new TPixel[count];
-
-            var rgba64 = new Rgba64(0, 0, 0, 65535);
-            for (int i = 0; i < count; i++)
-            {
-                int i6 = i * 6;
-                rgba64.Rgb = MemoryMarshal.Cast<byte, Rgb48>(sourceSpan.Slice(i6, 6))[0];
-                expected[i].PackFromRgba64(rgba64);
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.PackFromRgb48Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void ToRgb48Bytes(int count)
-        {
-            TPixel[] source = CreatePixelTestData(count);
-            byte[] expected = new byte[count * 6];
-            Rgb48 rgb = default;
-
-            for (int i = 0; i < count; i++)
-            {
-                int i6 = i * 6;
-                source[i].ToRgb48(ref rgb);
-                Rgba64Bytes rgb48Bytes = Unsafe.As<Rgb48, Rgba64Bytes>(ref rgb);
-                expected[i6] = rgb48Bytes[0];
-                expected[i6 + 1] = rgb48Bytes[1];
-                expected[i6 + 2] = rgb48Bytes[2];
-                expected[i6 + 3] = rgb48Bytes[3];
-                expected[i6 + 4] = rgb48Bytes[4];
-                expected[i6 + 5] = rgb48Bytes[5];
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.ToRgb48Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void PackFromRgba64Bytes(int count)
-        {
-            byte[] source = CreateByteTestData(count * 8);
-            Span<byte> sourceSpan = source.AsSpan();
-            var expected = new TPixel[count];
-
-            for (int i = 0; i < count; i++)
-            {
-                int i8 = i * 8;
-                expected[i].PackFromRgba64(MemoryMarshal.Cast<byte, Rgba64>(sourceSpan.Slice(i8, 8))[0]);
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.PackFromRgba64Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void ToRgba64Bytes(int count)
-        {
-            TPixel[] source = CreatePixelTestData(count);
-            byte[] expected = new byte[count * 8];
-            Rgba64 rgba = default;
-
-            for (int i = 0; i < count; i++)
-            {
-                int i8 = i * 8;
-                source[i].ToRgba64(ref rgba);
-                Rgba64Bytes rgba64Bytes = Unsafe.As<Rgba64, Rgba64Bytes>(ref rgba);
-                expected[i8] = rgba64Bytes[0];
-                expected[i8 + 1] = rgba64Bytes[1];
-                expected[i8 + 2] = rgba64Bytes[2];
-                expected[i8 + 3] = rgba64Bytes[3];
-                expected[i8 + 4] = rgba64Bytes[4];
-                expected[i8 + 5] = rgba64Bytes[5];
-                expected[i8 + 6] = rgba64Bytes[6];
-                expected[i8 + 7] = rgba64Bytes[7];
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.ToRgba64Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void PackFromBgr24Bytes(int count)
-        {
-            byte[] source = CreateByteTestData(count * 3);
-            var expected = new TPixel[count];
-
-            for (int i = 0; i < count; i++)
-            {
-                int i3 = i * 3;
-
-                expected[i].PackFromRgba32(new Rgba32(source[i3 + 2], source[i3 + 1], source[i3 + 0], 255));
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.PackFromBgr24Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void ToBgr24Bytes(int count)
-        {
-            TPixel[] source = CreatePixelTestData(count);
-            byte[] expected = new byte[count * 3];
-            var bgr = default(Bgr24);
-
-            for (int i = 0; i < count; i++)
-            {
-                int i3 = i * 3;
-                source[i].ToBgr24(ref bgr);
-                expected[i3] = bgr.B;
-                expected[i3 + 1] = bgr.G;
-                expected[i3 + 2] = bgr.R;
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.ToBgr24Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void PackFromBgra32Bytes(int count)
-        {
-            byte[] source = CreateByteTestData(count * 4);
-            var expected = new TPixel[count];
-
-            for (int i = 0; i < count; i++)
-            {
-                int i4 = i * 4;
-
-                expected[i].PackFromRgba32(new Rgba32(source[i4 + 2], source[i4 + 1], source[i4 + 0], source[i4 + 3]));
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.PackFromBgra32Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void ToZyxwBytes(int count)
-        {
-            TPixel[] source = CreatePixelTestData(count);
-            byte[] expected = new byte[count * 4];
-            var bgra = default(Bgra32);
-
-            for (int i = 0; i < count; i++)
-            {
-                int i4 = i * 4;
-                source[i].ToBgra32(ref bgra);
-                expected[i4] = bgra.B;
-                expected[i4 + 1] = bgra.G;
-                expected[i4 + 2] = bgra.R;
-                expected[i4 + 3] = bgra.A;
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.ToBgra32Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void PackFromArgb32Bytes(int count)
-        {
-            byte[] source = CreateByteTestData(count * 4);
-            var expected = new TPixel[count];
-
-            for (int i = 0; i < count; i++)
-            {
-                int i4 = i * 4;
-
-                expected[i].PackFromRgba32(new Rgba32(source[i4 + 1], source[i4 + 2], source[i4 + 3], source[i4 + 0]));
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.PackFromArgb32Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        [Theory]
-        [MemberData(nameof(ArraySizesData))]
-        public void ToArgb32Bytes(int count)
-        {
-            TPixel[] source = CreatePixelTestData(count);
-            byte[] expected = new byte[count * 4];
-            var argb = default(Argb32);
-
-            for (int i = 0; i < count; i++)
-            {
-                int i4 = i * 4;
-                source[i].ToArgb32(ref argb);
-                expected[i4] = argb.A;
-                expected[i4 + 1] = argb.R;
-                expected[i4 + 2] = argb.G;
-                expected[i4 + 3] = argb.B;
-            }
-
-            TestOperation(
-                source,
-                expected,
-                (s, d) => Operations.ToArgb32Bytes(s, d.GetSpan(), count)
-            );
-        }
-
-        private class TestBuffers<TSource, TDest> : IDisposable
-            where TSource : struct
-            where TDest : struct
-        {
-            public TSource[] SourceBuffer { get; }
-            public IMemoryOwner<TDest> ActualDestBuffer { get; }
-            public TDest[] ExpectedDestBuffer { get; }
-
-            public TestBuffers(TSource[] source, TDest[] expectedDest)
-            {
-                this.SourceBuffer = source;
-                this.ExpectedDestBuffer = expectedDest;
-                this.ActualDestBuffer = Configuration.Default.MemoryAllocator.Allocate<TDest>(expectedDest.Length);
-            }
-
-            public void Dispose()
-            {
-                this.ActualDestBuffer.Dispose();
-            }
-
-            private const float Tolerance = 0.0001f;
-
-            public void Verify()
-            {
-                int count = this.ExpectedDestBuffer.Length;
-
-                if (typeof(TDest) == typeof(Vector4))
-                {
-
-                    Span<Vector4> expected = MemoryMarshal.Cast<TDest, Vector4>(this.ExpectedDestBuffer.AsSpan());
-                    Span<Vector4> actual = MemoryMarshal.Cast<TDest, Vector4>(this.ActualDestBuffer.GetSpan());
-
-                    for (int i = 0; i < count; i++)
-                    {
-                        // ReSharper disable PossibleNullReferenceException
-                        Assert.Equal(expected[i], actual[i], new ApproximateFloatComparer(0.001f));
-                        // ReSharper restore PossibleNullReferenceException
-                    }
-                }
-                else
-                {
-                    Span<TDest> expected = this.ExpectedDestBuffer.AsSpan();
-                    Span<TDest> actual = this.ActualDestBuffer.GetSpan();
-                    for (int i = 0; i < count; i++)
-                    {
-                        Assert.Equal(expected[i], actual[i]);
-                    }
-                }
-            }
-        }
-
-        internal static void TestOperation<TSource, TDest>(
-            TSource[] source,
-            TDest[] expected,
-            Action<TSource[], IMemoryOwner<TDest>> action)
-            where TSource : struct
-            where TDest : struct
-        {
-            using (var buffers = new TestBuffers<TSource, TDest>(source, expected))
-            {
-                action(buffers.SourceBuffer, buffers.ActualDestBuffer);
-                buffers.Verify();
-            }
-        }
-
-        internal static Vector4[] CreateVector4TestData(int length)
-        {
-            var result = new Vector4[length];
-            var rnd = new Random(42); // Deterministic random values
-
-            for (int i = 0; i < result.Length; i++)
-            {
-                result[i] = GetVector(rnd);
-            }
-            return result;
-        }
-
-        internal static TPixel[] CreatePixelTestData(int length)
-        {
-            var result = new TPixel[length];
-
-            var rnd = new Random(42); // Deterministic random values
-
-            for (int i = 0; i < result.Length; i++)
-            {
-                Vector4 v = GetVector(rnd);
-                result[i].PackFromVector4(v);
-            }
-
-            return result;
-        }
-
-        internal static TPixel[] CreateScaledPixelTestData(int length)
-        {
-            var result = new TPixel[length];
-
-            var rnd = new Random(42); // Deterministic random values
-
-            for (int i = 0; i < result.Length; i++)
-            {
-                Vector4 v = GetVector(rnd);
-                result[i].PackFromScaledVector4(v);
-            }
-
-            return result;
-        }
-
-        internal static byte[] CreateByteTestData(int length)
-        {
-            byte[] result = new byte[length];
-            var rnd = new Random(42); // Deterministic random values
-
-            for (int i = 0; i < result.Length; i++)
-            {
-                result[i] = (byte)rnd.Next(255);
-            }
-            return result;
-        }
-
-        internal static Vector4 GetVector(Random rnd)
-        {
-            return new Vector4(
-                (float)rnd.NextDouble(),
-                (float)rnd.NextDouble(),
-                (float)rnd.NextDouble(),
-                (float)rnd.NextDouble()
-            );
-        }
-
-        [StructLayout(LayoutKind.Sequential)]
-        private unsafe struct Rgba64Bytes
-        {
-            public fixed byte Data[8];
-
-            public byte this[int idx]
-            {
-                [MethodImpl(MethodImplOptions.AggressiveInlining)]
-                get
-                {
-                    ref byte self = ref Unsafe.As<Rgba64Bytes, byte>(ref this);
-                    return Unsafe.Add(ref self, idx);
-                }
-            }
-        }
-    }
+        }
+    }
+
+    public abstract class PixelOperationsTests<TPixel> : MeasureFixture
+        where TPixel : struct, IPixel<TPixel>
+    {
+        protected PixelOperationsTests(ITestOutputHelper output)
+            : base(output)
+        {
+        }
+
+        public static TheoryData<int> ArraySizesData => new TheoryData<int> { 0, 1, 2, 7, 16, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 1111 };
+
+        private static PixelOperations<TPixel> Operations => PixelOperations<TPixel>.Instance;
+
+        internal static TPixel[] CreateExpectedPixelData(Vector4[] source)
+        {
+            var expected = new TPixel[source.Length];
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i].PackFromVector4(source[i]);
+            }
+            return expected;
+        }
+
+        internal static TPixel[] CreateScaledExpectedPixelData(Vector4[] source)
+        {
+            var expected = new TPixel[source.Length];
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i].PackFromScaledVector4(source[i]);
+            }
+            return expected;
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void PackFromVector4(int count)
+        {
+            Vector4[] source = CreateVector4TestData(count);
+            TPixel[] expected = CreateExpectedPixelData(source);
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.PackFromVector4(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void PackFromScaledVector4(int count)
+        {
+            Vector4[] source = CreateVector4TestData(count);
+            TPixel[] expected = CreateScaledExpectedPixelData(source);
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.PackFromScaledVector4(s, d.GetSpan(), count)
+            );
+        }
+
+        internal static Vector4[] CreateExpectedVector4Data(TPixel[] source)
+        {
+            var expected = new Vector4[source.Length];
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] = source[i].ToVector4();
+            }
+            return expected;
+        }
+
+        internal static Vector4[] CreateExpectedScaledVector4Data(TPixel[] source)
+        {
+            var expected = new Vector4[source.Length];
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] = source[i].ToScaledVector4();
+            }
+            return expected;
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void ToVector4(int count)
+        {
+            TPixel[] source = CreatePixelTestData(count);
+            Vector4[] expected = CreateExpectedVector4Data(source);
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.ToVector4(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void ToScaledVector4(int count)
+        {
+            TPixel[] source = CreateScaledPixelTestData(count);
+            Vector4[] expected = CreateExpectedScaledVector4Data(source);
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.ToScaledVector4(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void PackFromRgb24Bytes(int count)
+        {
+            byte[] source = CreateByteTestData(count * 3);
+            var expected = new TPixel[count];
+
+            for (int i = 0; i < count; i++)
+            {
+                int i3 = i * 3;
+
+                expected[i].PackFromRgba32(new Rgba32(source[i3 + 0], source[i3 + 1], source[i3 + 2], 255));
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.PackFromRgb24Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void ToRgb24Bytes(int count)
+        {
+            TPixel[] source = CreatePixelTestData(count);
+            byte[] expected = new byte[count * 3];
+            var rgb = default(Rgb24);
+
+            for (int i = 0; i < count; i++)
+            {
+                int i3 = i * 3;
+                source[i].ToRgb24(ref rgb);
+                expected[i3] = rgb.R;
+                expected[i3 + 1] = rgb.G;
+                expected[i3 + 2] = rgb.B;
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.ToRgb24Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void PackFromRgba32Bytes(int count)
+        {
+            byte[] source = CreateByteTestData(count * 4);
+            var expected = new TPixel[count];
+
+            for (int i = 0; i < count; i++)
+            {
+                int i4 = i * 4;
+
+                expected[i].PackFromRgba32(new Rgba32(source[i4 + 0], source[i4 + 1], source[i4 + 2], source[i4 + 3]));
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.PackFromRgba32Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void ToRgba32Bytes(int count)
+        {
+            TPixel[] source = CreatePixelTestData(count);
+            byte[] expected = new byte[count * 4];
+            var rgba = default(Rgba32);
+
+            for (int i = 0; i < count; i++)
+            {
+                int i4 = i * 4;
+                source[i].ToRgba32(ref rgba);
+                expected[i4] = rgba.R;
+                expected[i4 + 1] = rgba.G;
+                expected[i4 + 2] = rgba.B;
+                expected[i4 + 3] = rgba.A;
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.ToRgba32Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void PackFromRgb48Bytes(int count)
+        {
+            byte[] source = CreateByteTestData(count * 6);
+            Span<byte> sourceSpan = source.AsSpan();
+            var expected = new TPixel[count];
+
+            var rgba64 = new Rgba64(0, 0, 0, 65535);
+            for (int i = 0; i < count; i++)
+            {
+                int i6 = i * 6;
+                rgba64.Rgb = MemoryMarshal.Cast<byte, Rgb48>(sourceSpan.Slice(i6, 6))[0];
+                expected[i].PackFromRgba64(rgba64);
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.PackFromRgb48Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void ToRgb48Bytes(int count)
+        {
+            TPixel[] source = CreatePixelTestData(count);
+            byte[] expected = new byte[count * 6];
+            Rgb48 rgb = default;
+
+            for (int i = 0; i < count; i++)
+            {
+                int i6 = i * 6;
+                source[i].ToRgb48(ref rgb);
+                Rgba64Bytes rgb48Bytes = Unsafe.As<Rgb48, Rgba64Bytes>(ref rgb);
+                expected[i6] = rgb48Bytes[0];
+                expected[i6 + 1] = rgb48Bytes[1];
+                expected[i6 + 2] = rgb48Bytes[2];
+                expected[i6 + 3] = rgb48Bytes[3];
+                expected[i6 + 4] = rgb48Bytes[4];
+                expected[i6 + 5] = rgb48Bytes[5];
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.ToRgb48Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void PackFromRgba64Bytes(int count)
+        {
+            byte[] source = CreateByteTestData(count * 8);
+            Span<byte> sourceSpan = source.AsSpan();
+            var expected = new TPixel[count];
+
+            for (int i = 0; i < count; i++)
+            {
+                int i8 = i * 8;
+                expected[i].PackFromRgba64(MemoryMarshal.Cast<byte, Rgba64>(sourceSpan.Slice(i8, 8))[0]);
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.PackFromRgba64Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void ToRgba64Bytes(int count)
+        {
+            TPixel[] source = CreatePixelTestData(count);
+            byte[] expected = new byte[count * 8];
+            Rgba64 rgba = default;
+
+            for (int i = 0; i < count; i++)
+            {
+                int i8 = i * 8;
+                source[i].ToRgba64(ref rgba);
+                Rgba64Bytes rgba64Bytes = Unsafe.As<Rgba64, Rgba64Bytes>(ref rgba);
+                expected[i8] = rgba64Bytes[0];
+                expected[i8 + 1] = rgba64Bytes[1];
+                expected[i8 + 2] = rgba64Bytes[2];
+                expected[i8 + 3] = rgba64Bytes[3];
+                expected[i8 + 4] = rgba64Bytes[4];
+                expected[i8 + 5] = rgba64Bytes[5];
+                expected[i8 + 6] = rgba64Bytes[6];
+                expected[i8 + 7] = rgba64Bytes[7];
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.ToRgba64Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void PackFromBgr24Bytes(int count)
+        {
+            byte[] source = CreateByteTestData(count * 3);
+            var expected = new TPixel[count];
+
+            for (int i = 0; i < count; i++)
+            {
+                int i3 = i * 3;
+
+                expected[i].PackFromRgba32(new Rgba32(source[i3 + 2], source[i3 + 1], source[i3 + 0], 255));
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.PackFromBgr24Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void ToBgr24Bytes(int count)
+        {
+            TPixel[] source = CreatePixelTestData(count);
+            byte[] expected = new byte[count * 3];
+            var bgr = default(Bgr24);
+
+            for (int i = 0; i < count; i++)
+            {
+                int i3 = i * 3;
+                source[i].ToBgr24(ref bgr);
+                expected[i3] = bgr.B;
+                expected[i3 + 1] = bgr.G;
+                expected[i3 + 2] = bgr.R;
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.ToBgr24Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void PackFromBgra32Bytes(int count)
+        {
+            byte[] source = CreateByteTestData(count * 4);
+            var expected = new TPixel[count];
+
+            for (int i = 0; i < count; i++)
+            {
+                int i4 = i * 4;
+
+                expected[i].PackFromRgba32(new Rgba32(source[i4 + 2], source[i4 + 1], source[i4 + 0], source[i4 + 3]));
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.PackFromBgra32Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void ToZyxwBytes(int count)
+        {
+            TPixel[] source = CreatePixelTestData(count);
+            byte[] expected = new byte[count * 4];
+            var bgra = default(Bgra32);
+
+            for (int i = 0; i < count; i++)
+            {
+                int i4 = i * 4;
+                source[i].ToBgra32(ref bgra);
+                expected[i4] = bgra.B;
+                expected[i4 + 1] = bgra.G;
+                expected[i4 + 2] = bgra.R;
+                expected[i4 + 3] = bgra.A;
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.ToBgra32Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void PackFromArgb32Bytes(int count)
+        {
+            byte[] source = CreateByteTestData(count * 4);
+            var expected = new TPixel[count];
+
+            for (int i = 0; i < count; i++)
+            {
+                int i4 = i * 4;
+
+                expected[i].PackFromRgba32(new Rgba32(source[i4 + 1], source[i4 + 2], source[i4 + 3], source[i4 + 0]));
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.PackFromArgb32Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        [Theory]
+        [MemberData(nameof(ArraySizesData))]
+        public void ToArgb32Bytes(int count)
+        {
+            TPixel[] source = CreatePixelTestData(count);
+            byte[] expected = new byte[count * 4];
+            var argb = default(Argb32);
+
+            for (int i = 0; i < count; i++)
+            {
+                int i4 = i * 4;
+                source[i].ToArgb32(ref argb);
+                expected[i4] = argb.A;
+                expected[i4 + 1] = argb.R;
+                expected[i4 + 2] = argb.G;
+                expected[i4 + 3] = argb.B;
+            }
+
+            TestOperation(
+                source,
+                expected,
+                (s, d) => Operations.ToArgb32Bytes(s, d.GetSpan(), count)
+            );
+        }
+
+        private class TestBuffers<TSource, TDest> : IDisposable
+            where TSource : struct
+            where TDest : struct
+        {
+            public TSource[] SourceBuffer { get; }
+            public IMemoryOwner<TDest> ActualDestBuffer { get; }
+            public TDest[] ExpectedDestBuffer { get; }
+
+            public TestBuffers(TSource[] source, TDest[] expectedDest)
+            {
+                this.SourceBuffer = source;
+                this.ExpectedDestBuffer = expectedDest;
+                this.ActualDestBuffer = Configuration.Default.MemoryAllocator.Allocate<TDest>(expectedDest.Length);
+            }
+
+            public void Dispose()
+            {
+                this.ActualDestBuffer.Dispose();
+            }
+
+            private const float Tolerance = 0.0001f;
+
+            public void Verify()
+            {
+                int count = this.ExpectedDestBuffer.Length;
+
+                if (typeof(TDest) == typeof(Vector4))
+                {
+
+                    Span<Vector4> expected = MemoryMarshal.Cast<TDest, Vector4>(this.ExpectedDestBuffer.AsSpan());
+                    Span<Vector4> actual = MemoryMarshal.Cast<TDest, Vector4>(this.ActualDestBuffer.GetSpan());
+
+                    for (int i = 0; i < count; i++)
+                    {
+                        // ReSharper disable PossibleNullReferenceException
+                        Assert.Equal(expected[i], actual[i], new ApproximateFloatComparer(0.001f));
+                        // ReSharper restore PossibleNullReferenceException
+                    }
+                }
+                else
+                {
+                    Span<TDest> expected = this.ExpectedDestBuffer.AsSpan();
+                    Span<TDest> actual = this.ActualDestBuffer.GetSpan();
+                    for (int i = 0; i < count; i++)
+                    {
+                        Assert.Equal(expected[i], actual[i]);
+                    }
+                }
+            }
+        }
+
+        internal static void TestOperation<TSource, TDest>(
+            TSource[] source,
+            TDest[] expected,
+            Action<TSource[], IMemoryOwner<TDest>> action)
+            where TSource : struct
+            where TDest : struct
+        {
+            using (var buffers = new TestBuffers<TSource, TDest>(source, expected))
+            {
+                action(buffers.SourceBuffer, buffers.ActualDestBuffer);
+                buffers.Verify();
+            }
+        }
+
+        internal static Vector4[] CreateVector4TestData(int length)
+        {
+            var result = new Vector4[length];
+            var rnd = new Random(42); // Deterministic random values
+
+            for (int i = 0; i < result.Length; i++)
+            {
+                result[i] = GetVector(rnd);
+            }
+            return result;
+        }
+
+        internal static TPixel[] CreatePixelTestData(int length)
+        {
+            var result = new TPixel[length];
+
+            var rnd = new Random(42); // Deterministic random values
+
+            for (int i = 0; i < result.Length; i++)
+            {
+                Vector4 v = GetVector(rnd);
+                result[i].PackFromVector4(v);
+            }
+
+            return result;
+        }
+
+        internal static TPixel[] CreateScaledPixelTestData(int length)
+        {
+            var result = new TPixel[length];
+
+            var rnd = new Random(42); // Deterministic random values
+
+            for (int i = 0; i < result.Length; i++)
+            {
+                Vector4 v = GetVector(rnd);
+                result[i].PackFromScaledVector4(v);
+            }
+
+            return result;
+        }
+
+        internal static byte[] CreateByteTestData(int length)
+        {
+            byte[] result = new byte[length];
+            var rnd = new Random(42); // Deterministic random values
+
+            for (int i = 0; i < result.Length; i++)
+            {
+                result[i] = (byte)rnd.Next(255);
+            }
+            return result;
+        }
+
+        internal static Vector4 GetVector(Random rnd)
+        {
+            return new Vector4(
+                (float)rnd.NextDouble(),
+                (float)rnd.NextDouble(),
+                (float)rnd.NextDouble(),
+                (float)rnd.NextDouble()
+            );
+        }
+
+        [StructLayout(LayoutKind.Sequential)]
+        private unsafe struct Rgba64Bytes
+        {
+            public fixed byte Data[8];
+
+            public byte this[int idx]
+            {
+                [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                get
+                {
+                    ref byte self = ref Unsafe.As<Rgba64Bytes, byte>(ref this);
+                    return Unsafe.Add(ref self, idx);
+                }
+            }
+        }
+    }
 }
\ No newline at end of file
diff --git a/tests/ImageSharp.Tests/TestUtilities/TestDataGenerator.cs b/tests/ImageSharp.Tests/TestUtilities/TestDataGenerator.cs
index 0b1b89cc00..912b86e347 100644
--- a/tests/ImageSharp.Tests/TestUtilities/TestDataGenerator.cs
+++ b/tests/ImageSharp.Tests/TestUtilities/TestDataGenerator.cs
@@ -33,20 +33,35 @@ public static Vector4[] GenerateRandomVectorArray(this Random rnd, int length, f
             return values;
         }
 
-        public static float[] GenerateRandomRoundedFloatArray(this Random rnd, int length, int minVal, int maxValExclusive)
+        /// <summary>
+        /// Creates an array of <paramref name="length"/> random floats, each produced by scaling a random value
+        /// between <paramref name="minVal"/> and <paramref name="maxVal"/> and rounding it with <see cref="Math.Round(double)"/>.
+        /// </summary>
+        public static float[] GenerateRandomRoundedFloatArray(this Random rnd, int length, float minVal, float maxVal)
         {
             float[] values = new float[length];
 
             for (int i = 0; i < length; i++)
             {
-                int val = rnd.Next(minVal, maxValExclusive);
-                values[i] = (float)val;
+                values[i] = (float)Math.Round(rnd.GetRandomFloat(minVal, maxVal));
             }
 
             return values;
         }
 
-        private static float GetRandomFloat(Random rnd, float minVal, float maxVal)
+        /// <summary>
+        /// Creates an array of <paramref name="length"/> bytes and fills it using <see cref="Random.NextBytes(byte[])"/>.
+        /// </summary>
+        public static byte[] GenerateRandomByteArray(this Random rnd, int length)
+        {
+            byte[] values = new byte[length];
+            rnd.NextBytes(values);
+            return values;
+        }
+
+        /// <summary>
+        /// Scales <see cref="Random.NextDouble"/> by (<paramref name="maxVal"/> - <paramref name="minVal"/>) and offsets it by <paramref name="minVal"/>.
+        /// </summary>
+        private static float GetRandomFloat(this Random rnd, float minVal, float maxVal)
         {
             return (float)rnd.NextDouble() * (maxVal - minVal) + minVal;
         }
diff --git a/tests/ImageSharp.Tests/TestUtilities/Tests/TestEnvironmentTests.cs b/tests/ImageSharp.Tests/TestUtilities/Tests/TestEnvironmentTests.cs
index 8a3e69059f..30bb16c2a0 100644
--- a/tests/ImageSharp.Tests/TestUtilities/Tests/TestEnvironmentTests.cs
+++ b/tests/ImageSharp.Tests/TestUtilities/Tests/TestEnvironmentTests.cs
@@ -3,6 +3,8 @@
 
 using System;
 using System.IO;
+using System.Reflection;
+using System.Runtime.InteropServices;
 
 using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Formats;