SixLabors · JimBobSquarePants · Feb 20, 2023 · Feb 17, 2023 · Feb 17, 2023 · Feb 19, 2023
diff --git a/src/ImageSharp/Common/Constants.cs b/src/ImageSharp/Common/Constants.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Six Labors Split License.
 
 namespace SixLabors.ImageSharp;

diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -532,7 +532,8 @@ private static void Shuffle4Slice3(
         }
 
         /// <summary>
-        /// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>.
+        /// Performs a multiplication and an addition of the <see cref="Vector256{Single}"/>.
+        /// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
         /// </summary>
         /// <remarks>ret = (vm0 * vm1) + va</remarks>
         /// <param name="va">The vector to add to the intermediate result.</param>
@@ -549,22 +550,21 @@ public static Vector256<float> MultiplyAdd(
             {
                 return Fma.MultiplyAdd(vm1, vm0, va);
             }
-            else
-            {
-                return Avx.Add(Avx.Multiply(vm0, vm1), va);
-            }
+
+            return Avx.Add(Avx.Multiply(vm0, vm1), va);
         }
 
         /// <summary>
-        /// Performs a multiplication and a substraction of the <see cref="Vector256{T}"/>.
+        /// Performs a multiplication and a subtraction of the <see cref="Vector256{Single}"/>.
+        /// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
         /// </summary>
         /// <remarks>ret = (vm0 * vm1) - vs</remarks>
-        /// <param name="vs">The vector to substract from the intermediate result.</param>
+        /// <param name="vs">The vector to subtract from the intermediate result.</param>
         /// <param name="vm0">The first vector to multiply.</param>
         /// <param name="vm1">The second vector to multiply.</param>
         /// <returns>The <see cref="Vector256{T}"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Vector256<float> MultiplySubstract(
+        public static Vector256<float> MultiplySubtract(
             in Vector256<float> vs,
             in Vector256<float> vm0,
             in Vector256<float> vm1)
@@ -573,10 +573,30 @@ public static Vector256<float> MultiplySubstract(
             {
                 return Fma.MultiplySubtract(vm1, vm0, vs);
             }
-            else
+
+            return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
+        }
+
+        /// <summary>
+        /// Performs a negated multiplication and an addition of the <see cref="Vector256{Single}"/>.
+        /// </summary>
+        /// <remarks>ret = c - (a * b)</remarks>
+        /// <param name="a">The first vector to multiply.</param>
+        /// <param name="b">The second vector to multiply.</param>
+        /// <param name="c">The vector to add to the negated intermediate result.</param>
+        /// <returns>The <see cref="Vector256{Single}"/>.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static Vector256<float> MultiplyAddNegated(
+            in Vector256<float> a,
+            in Vector256<float> b,
+            in Vector256<float> c)
+        {
+            if (Fma.IsSupported)
             {
-                return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
+                return Fma.MultiplyAddNegated(a, b, c);
             }
+
+            return Avx.Subtract(c, Avx.Multiply(a, b));
         }
 
         /// <summary>

diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
@@ -99,7 +99,7 @@ static void IDCT8x8_1D_Avx(ref Block8x8F block)
 
             var mm256_F_1_4142 = Vector256.Create(1.414213562f);
             Vector256<float> tmp13 = Avx.Add(tmp1, tmp3);
-            Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
+            Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubtract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
 
             tmp0 = Avx.Add(tmp10, tmp13);
             tmp3 = Avx.Subtract(tmp10, tmp13);

diff --git a/src/ImageSharp/PixelFormats/PixelBlenders/DefaultPixelBlenders.Generated.cs b/src/ImageSharp/PixelFormats/PixelBlenders/DefaultPixelBlenders.Generated.cs
diff --git a/src/ImageSharp/PixelFormats/PixelBlenders/DefaultPixelBlenders.Generated.tt b/src/ImageSharp/PixelFormats/PixelBlenders/DefaultPixelBlenders.Generated.tt
@@ -13,6 +13,10 @@
 
 // <auto-generated />
 using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
 
 namespace SixLabors.ImageSharp.PixelFormats.PixelBlenders;
 
@@ -86,18 +90,85 @@ var blenders = new []{
         protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, float amount)
         {
             amount = Numerics.Clamp(amount, 0, 1);
-            for (int i = 0; i < destination.Length; i++)
+
+            if (Avx2.IsSupported && destination.Length >= 2)
             {
-                destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], amount);
+                // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+                ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+                ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+                ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+                ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+                Vector256<float> opacity = Vector256.Create(amount);
+
+                while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+                {
+                    destinationBase = PorterDuffFunctions.<#=blender_composer#>(backgroundBase, sourceBase, opacity);
+                    destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+                    backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+                    sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+                }
+
+                if (Numerics.Modulo2(destination.Length) != 0)
+                {
+                    // Vector4 values are processed in pairs, so at most one element can remain.
+                    int i = destination.Length - 1;
+                    destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], amount);
+                }
+            }
+            else
+            {
+                for (int i = 0; i < destination.Length; i++)
+                {
+                    destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], amount);
+                }
             }
         }
 
         /// <inheritdoc />
         protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, ReadOnlySpan<float> amount)
         {
-            for (int i = 0; i < destination.Length; i++)
+            if (Avx2.IsSupported && destination.Length >= 2)
+            {
+                // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+                ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
+                ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
+
+                ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
+                ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+                ref float amountBase = ref MemoryMarshal.GetReference(amount);
+
+                Vector256<float> vOne = Vector256.Create(1F);
+
+                while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
+                {
+                    // We need to create a Vector256<float> containing the current and next amount values
+                    // taking up each half of the Vector256<float> and then clamp them.
+                    Vector256<float> opacity = Vector256.Create(
+                        Vector128.Create(amountBase),
+                        Vector128.Create(Unsafe.Add(ref amountBase, 1)));
+                    opacity = Avx.Min(Avx.Max(Vector256<float>.Zero, opacity), vOne);
+
+                    destinationBase = PorterDuffFunctions.<#=blender_composer#>(backgroundBase, sourceBase, opacity);
+                    destinationBase = ref Unsafe.Add(ref destinationBase, 1);
+                    backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
+                    sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+                    amountBase = ref Unsafe.Add(ref amountBase, 2);
+                }
+
+                if (Numerics.Modulo2(destination.Length) != 0)
+                {
+                    // Vector4 values are processed in pairs, so at most one element can remain.
+                    int i = destination.Length - 1;
+                    destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+                }
+            }
+            else
             {
-                destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], Numerics.Clamp(amount[i], 0, 1));
+                for (int i = 0; i < destination.Length; i++)
+                {
+                    destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
+                }
             }
         }
     }