Fix and normalize Vector4 UnPremultiply #2369

Merged: 7 commits, Feb 26, 2023
115 changes: 92 additions & 23 deletions src/ImageSharp/Common/Helpers/Numerics.cs
@@ -474,21 +474,10 @@ private static void ClampImpl<T>(Span<T> span, T min, T max)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void Premultiply(ref Vector4 source)
{
float w = source.W;
source *= w;
source.W = w;
}

/// <summary>
/// Reverses the result of premultiplying a vector via <see cref="Premultiply(ref Vector4)"/>.
/// </summary>
/// <param name="source">The <see cref="Vector4"/> to premultiply</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void UnPremultiply(ref Vector4 source)
{
float w = source.W;
source /= w;
source.W = w;
// Load into a local variable to prevent accessing the source from memory multiple times.
Vector4 src = source;
Vector4 alpha = PermuteW(src);
source = WithW(src * alpha, alpha);
}

/// <summary>
@@ -498,7 +487,7 @@ public static void UnPremultiply(ref Vector4 source)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void Premultiply(Span<Vector4> vectors)
{
if (Avx2.IsSupported && vectors.Length >= 2)
if (Avx.IsSupported && vectors.Length >= 2)
{
// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
ref Vector256<float> vectorsBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors));
@@ -507,8 +496,8 @@ public static void Premultiply(Span<Vector4> vectors)
while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
{
Vector256<float> source = vectorsBase;
Vector256<float> multiply = Avx.Shuffle(source, source, ShuffleAlphaControl);
vectorsBase = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl);
Vector256<float> alpha = Avx.Permute(source, ShuffleAlphaControl);
vectorsBase = Avx.Blend(Avx.Multiply(source, alpha), source, BlendAlphaControl);
vectorsBase = ref Unsafe.Add(ref vectorsBase, 1);
}

@@ -532,24 +521,49 @@ public static void Premultiply(Span<Vector4> vectors)
}
}

/// <summary>
/// Reverses the result of premultiplying a vector via <see cref="Premultiply(ref Vector4)"/>.
/// </summary>
/// <param name="source">The <see cref="Vector4"/> to premultiply</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void UnPremultiply(ref Vector4 source)
{
Vector4 alpha = PermuteW(source);
UnPremultiply(ref source, alpha);
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void UnPremultiply(ref Vector4 source, Vector4 alpha)
{
if (alpha == Vector4.Zero)
{
return;
}

// Divide source by alpha if alpha is nonzero, otherwise set all components to match the source value
// Blend the result with the alpha vector to ensure that the alpha component is unchanged
source = WithW(source / alpha, alpha);
Comment on lines +538 to +545

@tannergooding (Contributor) commented on Feb 24, 2023:
Branching in SIMD code is "generally" undesirable.

It's probably worth comparing this to an implementation that does CompareEqual + Blend instead (much like you're already doing for the Avx2 path) to see which is better.

You should be able to use the AsVector128() again. With Sse.CompareEqual and using Sse41.BlendVariable.

If Sse41 isn't supported (which is unlikely since it's been around since 2007), you can replace it with Sse.Or(Sse.And(left, mask), Sse.AndNot(mask, right))

-- That is: (left & mask) | (right & ~mask)
-- Sse.AndNot(x, y) does ~x & y, despite its name implying x & ~y; a weird quirk of the instruction
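The fallback blend described above can be modeled lane by lane. The sketch below is Python, not SSE, using integers as 32-bit lanes; `blend_variable` is an illustrative name for the `(left & mask) | (right & ~mask)` idiom, with the second term written as `~mask & right` to mirror the `Sse.AndNot(mask, right)` quirk:

```python
# Scalar model of the SSE fallback blend: each mask lane is all-ones
# (0xFFFFFFFF) to select the `left` lane, or all-zeros to select `right`.
ALL_ONES = 0xFFFFFFFF

def blend_variable(left, right, mask):
    """Per-lane select: (left & mask) | (~mask & right)."""
    return [(l & m) | ((~m & ALL_ONES) & r)
            for l, r, m in zip(left, right, mask)]

left = [0x11111111, 0x22222222, 0x33333333, 0x44444444]
right = [0xAAAAAAAA, 0xBBBBBBBB, 0xCCCCCCCC, 0xDDDDDDDD]
mask = [ALL_ONES, 0, ALL_ONES, 0]
print([hex(x) for x in blend_variable(left, right, mask)])
```

Each mask lane must be all-ones or all-zeros, which is exactly what a SIMD compare such as `Sse.CompareEqual` produces.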

@tannergooding (Contributor) added:

-- left here represents the value to use if the comparison result (mask) was true for that element
-- right correspondingly is the value to use if it was false

@JimBobSquarePants (Member, Author) replied on Feb 24, 2023:

Would you believe the version I had was actually faster!?

UnPremultiply using Vector4 and branching.

| Method                | Mean     | Error     | StdDev    | Ratio |
|-----------------------|----------|-----------|-----------|-------|
| UnPremultiplyBaseline | 1.545 us | 0.0070 us | 0.0066 us | 1.00  |
| UnPremultiply         | 1.827 us | 0.0039 us | 0.0037 us | 1.18  |
| UnPremultiplyBulk     | 1.267 us | 0.0033 us | 0.0027 us | 0.82  |

UnPremultiply using Sse41.

| Method                | Mean     | Error     | StdDev    | Ratio |
|-----------------------|----------|-----------|-----------|-------|
| UnPremultiplyBaseline | 1.546 us | 0.0077 us | 0.0068 us | 1.00  |
| UnPremultiply         | 2.058 us | 0.0043 us | 0.0036 us | 1.33  |
| UnPremultiplyBulk     | 1.268 us | 0.0025 us | 0.0022 us | 0.82  |
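For reference, the per-pixel math these benchmarks exercise is small enough to sketch outright. This is a Python model of the scalar operation, not ImageSharp code: premultiply scales the color channels by alpha, and the fixed unpremultiply leaves fully transparent pixels untouched instead of dividing by zero:

```python
def premultiply(r, g, b, a):
    # Scale the color channels by alpha; alpha itself is unchanged.
    return (r * a, g * a, b * a, a)

def unpremultiply(r, g, b, a):
    # The fix under review: zero alpha would divide by zero, so
    # fully transparent pixels are returned unchanged.
    if a == 0:
        return (r, g, b, a)
    return (r / a, g / a, b / a, a)

# Round trip: unpremultiply reverses premultiply for nonzero alpha.
print(unpremultiply(*premultiply(0.5, 0.25, 1.0, 0.5)))  # (0.5, 0.25, 1.0, 0.5)
```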

@tannergooding (Contributor) replied:

I'd believe it.

The general reason to avoid branching is that it often slows down the overall operation. However, the exact difference depends on the scenario being tested and how frequently that branch will be hit as compared to the cost of computing and blending both paths.

If I'm reading your table right, then in this case, they look practically identical with a little less error and deviation for blending in Bulk. But the branching ends up winning out for single operations.
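The two strategies being compared can be modeled side by side. In this Python sketch (function names are illustrative), the branchy form early-outs on zero alpha as the scalar path does, while the blend form computes the quotient for every lane and then selects per a zero mask, as the AVX path does with `CompareEqual` plus `BlendVariable`:

```python
def unpremultiply_branchy(src, alpha):
    # Early-out on fully transparent pixels, as in the scalar Vector4 path.
    if alpha == 0:
        return src
    return [c / alpha for c in src]

def unpremultiply_blend(src, alpha):
    # Branchless form: compute the division unconditionally (guarded here
    # only because Python raises where SIMD hardware would not), then
    # select the source lanes wherever the alpha compared equal to zero.
    divided = [c / alpha if alpha != 0 else 0.0 for c in src]
    mask_zero = (alpha == 0)
    return [s if mask_zero else d for s, d in zip(src, divided)]

for alpha in (0.0, 0.25, 1.0):
    assert unpremultiply_branchy([0.5, 0.25, 0.125], alpha) == \
           unpremultiply_blend([0.5, 0.25, 0.125], alpha)
```

Both forms agree on every input; the benchmarks above measure only which one is cheaper per call.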

@JimBobSquarePants (Member, Author) replied:

Yeah. We'd only ever use it for single operations or for overlaps in the AVX run.

}

/// <summary>
/// Bulk variant of <see cref="UnPremultiply(ref Vector4)"/>
/// </summary>
/// <param name="vectors">The span of vectors</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void UnPremultiply(Span<Vector4> vectors)
{
if (Avx2.IsSupported && vectors.Length >= 2)
if (Avx.IsSupported && vectors.Length >= 2)
{
// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
ref Vector256<float> vectorsBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors));
ref Vector256<float> vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));
Vector256<float> epsilon = Vector256.Create(Constants.Epsilon);

while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
{
Vector256<float> source = vectorsBase;
Vector256<float> multiply = Avx.Shuffle(source, source, ShuffleAlphaControl);
vectorsBase = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl);
Vector256<float> alpha = Avx.Permute(source, ShuffleAlphaControl);
vectorsBase = UnPremultiply(source, alpha);
vectorsBase = ref Unsafe.Add(ref vectorsBase, 1);
}

@@ -573,6 +587,61 @@ public static void UnPremultiply(Span<Vector4> vectors)
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> UnPremultiply(Vector256<float> source, Vector256<float> alpha)
{
// Check if alpha is zero to avoid division by zero
Vector256<float> zeroMask = Avx.CompareEqual(alpha, Vector256<float>.Zero);

// Divide source by alpha if alpha is nonzero, otherwise set all components to match the source value
Vector256<float> result = Avx.BlendVariable(Avx.Divide(source, alpha), source, zeroMask);

// Blend the result with the alpha vector to ensure that the alpha component is unchanged
return Avx.Blend(result, alpha, BlendAlphaControl);
}

/// <summary>
/// Permutes the given vector, returning a new instance with all the values set to <see cref="Vector4.W"/>.
/// </summary>
/// <param name="value">The vector.</param>
/// <returns>The <see cref="Vector4"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector4 PermuteW(Vector4 value)
{
if (Sse.IsSupported)
{
return Sse.Shuffle(value.AsVector128(), value.AsVector128(), ShuffleAlphaControl).AsVector4();
}

return new(value.W);
}

/// <summary>
/// Sets the W component of the given vector <paramref name="value"/> to the given value from <paramref name="w"/>.
/// </summary>
/// <param name="value">The vector to set.</param>
/// <param name="w">The vector containing the W value.</param>
/// <returns>The <see cref="Vector4"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector4 WithW(Vector4 value, Vector4 w)
{
if (Sse41.IsSupported)
{
return Sse41.Insert(value.AsVector128(), w.AsVector128(), 0b11_11_0000).AsVector4();
}

if (Sse.IsSupported)
{
// Create tmp as <w[3], w[0], value[2], value[0]>
// Then return <value[0], value[1], tmp[2], tmp[0]> (which is <value[0], value[1], value[2], w[3]>)
Vector128<float> tmp = Sse.Shuffle(w.AsVector128(), value.AsVector128(), 0b00_10_00_11);
return Sse.Shuffle(value.AsVector128(), tmp, 0b00_10_01_00).AsVector4();
}

value.W = w.W;
return value;
}
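The two-shuffle `WithW` fallback above is easy to check against a scalar model of `Sse.Shuffle`. In this Python sketch (`sse_shuffle` and `with_w` are illustrative names), the low two result lanes come from the first operand and the high two from the second, each chosen by a 2-bit field of the control byte:

```python
def sse_shuffle(a, b, control):
    # Model of Sse.Shuffle(a, b, control):
    # result = <a[c0], a[c1], b[c2], b[c3]> where c_i is bits 2i..2i+1.
    idx = [(control >> (2 * i)) & 0b11 for i in range(4)]
    return [a[idx[0]], a[idx[1]], b[idx[2]], b[idx[3]]]

def with_w(value, w):
    # The two-shuffle fallback from the diff above.
    tmp = sse_shuffle(w, value, 0b00_10_00_11)     # <w[3], w[0], value[2], value[0]>
    return sse_shuffle(value, tmp, 0b00_10_01_00)  # <value[0], value[1], value[2], w[3]>

print(with_w([1.0, 2.0, 3.0, 4.0], [9.0, 9.0, 9.0, 0.5]))  # [1.0, 2.0, 3.0, 0.5]
```

Working through the control bytes confirms the comments in the diff: the intermediate `tmp` carries `w[3]` in lane 0 and `value[2]` in lane 2, and the second shuffle reassembles `<value[0], value[1], value[2], w[3]>`.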

/// <summary>
/// Calculates the cube pow of all the XYZ channels of the input vectors.
/// </summary>
@@ -586,7 +655,7 @@ public static unsafe void CubePowOnXYZ(Span<Vector4> vectors)
while (Unsafe.IsAddressLessThan(ref baseRef, ref endRef))
{
Vector4 v = baseRef;
float a = v.W;
Vector4 a = PermuteW(v);

// Fast path for the default gamma exposure, which is 3. In this case we can skip
// calling Math.Pow 3 times (one per component), as the method is an internal call and
@@ -595,7 +664,7 @@ public static unsafe void CubePowOnXYZ(Span<Vector4> vectors)
// back to the target index in the temporary span. The whole iteration will get completely
// inlined and translated into vectorized instructions, with much better performance.
v = v * v * v;
v.W = a;
v = WithW(v, a);

baseRef = v;
baseRef = ref Unsafe.Add(ref baseRef, 1);
5 changes: 1 addition & 4 deletions src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs
@@ -47,10 +47,7 @@ internal interface IShuffle4 : IComponentShuffle
internal readonly struct DefaultShuffle4 : IShuffle4
{
public DefaultShuffle4(byte control)
{
DebugGuard.MustBeBetweenOrEqualTo<byte>(control, 0, 3, nameof(control));
this.Control = control;
}
=> this.Control = control;

public byte Control { get; }

5 changes: 1 addition & 4 deletions src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs
@@ -15,10 +15,7 @@ internal interface IPad3Shuffle4 : IComponentShuffle
internal readonly struct DefaultPad3Shuffle4 : IPad3Shuffle4
{
public DefaultPad3Shuffle4(byte control)
{
DebugGuard.MustBeBetweenOrEqualTo<byte>(control, 0, 3, nameof(control));
this.Control = control;
}
=> this.Control = control;

public byte Control { get; }

5 changes: 1 addition & 4 deletions src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs
@@ -15,10 +15,7 @@ internal interface IShuffle3 : IComponentShuffle
internal readonly struct DefaultShuffle3 : IShuffle3
{
public DefaultShuffle3(byte control)
{
DebugGuard.MustBeBetweenOrEqualTo<byte>(control, 0, 3, nameof(control));
this.Control = control;
}
=> this.Control = control;

public byte Control { get; }

5 changes: 1 addition & 4 deletions src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs
@@ -15,10 +15,7 @@ internal interface IShuffle4Slice3 : IComponentShuffle
internal readonly struct DefaultShuffle4Slice3 : IShuffle4Slice3
{
public DefaultShuffle4Slice3(byte control)
{
DebugGuard.MustBeBetweenOrEqualTo<byte>(control, 0, 3, nameof(control));
this.Control = control;
}
=> this.Control = control;

public byte Control { get; }
