Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable Avx2 optimizations on Porter-Duff operations. #2359

Merged
merged 25 commits into from
Feb 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
517ec80
Port most of the function components.
JimBobSquarePants Feb 17, 2023
6a4dcd7
Merge branch 'main' into js/avx2-porter-duff
JimBobSquarePants Feb 17, 2023
746b34d
Finish porting function components
JimBobSquarePants Feb 19, 2023
4c546d7
Update the PorterDuffFunctions.Generated.tt to include the Vector256<…
JimBobSquarePants Feb 19, 2023
ef34960
Fix code generation
JimBobSquarePants Feb 19, 2023
9f8bcc4
Respond to feedback
JimBobSquarePants Feb 19, 2023
5fedca8
Respond to feedback
JimBobSquarePants Feb 19, 2023
907400f
Use Permute
JimBobSquarePants Feb 19, 2023
9a552f1
Revert "Use Permute"
JimBobSquarePants Feb 19, 2023
bde9324
Use Permute
JimBobSquarePants Feb 19, 2023
41cfa9b
Port DefaultPixelBlenders
JimBobSquarePants Feb 19, 2023
b4ff1e4
Fix issues
JimBobSquarePants Feb 19, 2023
c58be60
Add additional PD tests
JimBobSquarePants Feb 19, 2023
dff381f
Fix amount span assignment
JimBobSquarePants Feb 19, 2023
6cb6bd4
Better clamp, fix offset (again)
JimBobSquarePants Feb 19, 2023
c06da8c
Add NormalSrcOver benchmark
JimBobSquarePants Feb 19, 2023
b05b25b
Use RemoteExecutor for composition tests
JimBobSquarePants Feb 19, 2023
916084c
Fix field assignment in benchmark
JimBobSquarePants Feb 19, 2023
8ffec30
Make Scalar default
JimBobSquarePants Feb 19, 2023
a666372
Use FMA where possible.
JimBobSquarePants Feb 19, 2023
afdc53c
Tanners Top Tips!!
JimBobSquarePants Feb 20, 2023
7309b6e
Merge branch 'main' into js/avx2-porter-duff
JimBobSquarePants Feb 20, 2023
78eb2f1
Use WithW
JimBobSquarePants Feb 20, 2023
ac0d27d
Provide Sse fallback for WithW
JimBobSquarePants Feb 20, 2023
9752566
Merge branch 'main' into js/avx2-porter-duff
JimBobSquarePants Feb 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/ImageSharp/Common/Constants.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.

namespace SixLabors.ImageSharp;
Expand Down
40 changes: 30 additions & 10 deletions src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -532,7 +532,8 @@ private static void Shuffle4Slice3(
}

/// <summary>
/// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>.
/// Performs a multiplication and an addition of the <see cref="Vector256{Single}"/>.
/// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
/// </summary>
/// <remarks>ret = (vm0 * vm1) + va</remarks>
/// <param name="va">The vector to add to the intermediate result.</param>
Expand All @@ -549,22 +550,21 @@ public static Vector256<float> MultiplyAdd(
{
return Fma.MultiplyAdd(vm1, vm0, va);
}
else
{
return Avx.Add(Avx.Multiply(vm0, vm1), va);
}

return Avx.Add(Avx.Multiply(vm0, vm1), va);
}

/// <summary>
/// Performs a multiplication and a substraction of the <see cref="Vector256{T}"/>.
/// Performs a multiplication and a subtraction of the <see cref="Vector256{Single}"/>.
/// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
/// </summary>
/// <remarks>ret = (vm0 * vm1) - vs</remarks>
/// <param name="vs">The vector to substract from the intermediate result.</param>
/// <param name="vs">The vector to subtract from the intermediate result.</param>
/// <param name="vm0">The first vector to multiply.</param>
/// <param name="vm1">The second vector to multiply.</param>
/// <returns>The <see cref="Vector256{T}"/>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static Vector256<float> MultiplySubstract(
public static Vector256<float> MultiplySubtract(
in Vector256<float> vs,
in Vector256<float> vm0,
in Vector256<float> vm1)
Expand All @@ -573,10 +573,30 @@ public static Vector256<float> MultiplySubstract(
{
return Fma.MultiplySubtract(vm1, vm0, vs);
}
else

return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
}

/// <summary>
/// Performs a multiplication and a negated addition of the <see cref="Vector256{Single}"/>.
/// </summary>
/// <remarks>ret = c - (a * b)</remarks>
/// <param name="a">The first vector to multiply.</param>
/// <param name="b">The second vector to multiply.</param>
/// <param name="c">The vector to add negated to the intermediate result.</param>
/// <returns>The <see cref="Vector256{T}"/>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static Vector256<float> MultiplyAddNegated(
in Vector256<float> a,
in Vector256<float> b,
in Vector256<float> c)
Comment on lines +590 to +592
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Particularly since this is inlined, I wouldn't expect you to need `in` here (or on the other examples above).

It shouldn't actually make a difference since the JIT should inline things and elide the "address taken", but it's typically easier on the JIT if you don't mark them unnecessarily for "enregisterable" types.

Copy link
Contributor

@saucecontrol saucecontrol Feb 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC, without the `in`, the JIT could/would lose track of the fact that memory load operands could be contained when inlining these small helpers, at least in netcoreapp3.1. Don't know if or when that was fixed, but I'd double check before removing them.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't looked at the JIT but am seeing a 2% slowdown when running my benchmark when I remove them. Could be error but I'll leave them for now.

{
if (Fma.IsSupported)
{
return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
return Fma.MultiplyAddNegated(a, b, c);
}

return Avx.Subtract(c, Avx.Multiply(a, b));
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ static void IDCT8x8_1D_Avx(ref Block8x8F block)

var mm256_F_1_4142 = Vector256.Create(1.414213562f);
Vector256<float> tmp13 = Avx.Add(tmp1, tmp3);
Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubtract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);

tmp0 = Avx.Add(tmp10, tmp13);
tmp3 = Avx.Subtract(tmp10, tmp13);
Expand Down
8,112 changes: 7,676 additions & 436 deletions src/ImageSharp/PixelFormats/PixelBlenders/DefaultPixelBlenders.Generated.cs

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@

// <auto-generated />
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace SixLabors.ImageSharp.PixelFormats.PixelBlenders;

Expand Down Expand Up @@ -86,18 +90,85 @@ var blenders = new []{
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, float amount)
{
amount = Numerics.Clamp(amount, 0, 1);
for (int i = 0; i < destination.Length; i++)

if (Avx2.IsSupported && destination.Length >= 2)
{
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], amount);
// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));

ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
Vector256<float> opacity = Vector256.Create(amount);

while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
{
destinationBase = PorterDuffFunctions.<#=blender_composer#>(backgroundBase, sourceBase, opacity);
destinationBase = ref Unsafe.Add(ref destinationBase, 1);
backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
sourceBase = ref Unsafe.Add(ref sourceBase, 1);
}

if (Numerics.Modulo2(destination.Length) != 0)
{
// Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
int i = destination.Length - 1;
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], amount);
}
}
else
{
for (int i = 0; i < destination.Length; i++)
{
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], amount);
}
}
}

/// <inheritdoc />
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, ReadOnlySpan<float> amount)
{
for (int i = 0; i < destination.Length; i++)
if (Avx2.IsSupported && destination.Length >= 2)
{
// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));

ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
ref float amountBase = ref MemoryMarshal.GetReference(amount);

Vector256<float> vOne = Vector256.Create(1F);

while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
{
// We need to create a Vector256<float> containing the current and next amount values
// taking up each half of the Vector256<float> and then clamp them.
Vector256<float> opacity = Vector256.Create(
Vector128.Create(amountBase),
Vector128.Create(Unsafe.Add(ref amountBase, 1)));
opacity = Avx.Min(Avx.Max(Vector256<float>.Zero, opacity), vOne);

destinationBase = PorterDuffFunctions.<#=blender_composer#>(backgroundBase, sourceBase, opacity);
destinationBase = ref Unsafe.Add(ref destinationBase, 1);
backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
sourceBase = ref Unsafe.Add(ref sourceBase, 1);
amountBase = ref Unsafe.Add(ref amountBase, 2);
}

if (Numerics.Modulo2(destination.Length) != 0)
{
// Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
int i = destination.Length - 1;
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
}
}
else
{
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], Numerics.Clamp(amount[i], 0, 1));
for (int i = 0; i < destination.Length; i++)
{
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
}
}
}
}
Expand Down
Loading