Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Span.Reverse fast path performance #70944

Merged
merged 11 commits into from
Nov 18, 2022
109 changes: 77 additions & 32 deletions src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Buffers.Binary;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Numerics;
Expand Down Expand Up @@ -1129,21 +1130,23 @@ private static unsafe nuint UnalignedCountVector128(ref byte searchSpace)

public static void Reverse(ref byte buf, nuint length)
{
if (Avx2.IsSupported && (nuint)Vector256<byte>.Count * 2 <= length)
Debug.Assert(length > 1);

nint remainder = (nint)length;
nint offset = 0;

if (Avx2.IsSupported && remainder >= Vector256<byte>.Count)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The trick with taking the vectorized path for sizes smaller than 2*Vector256.Count is neat, but it hurts performance in number of cases. There can be a lot of redundant work done for certain sizes with this PR.

I have opened #78604 on this.

{
Vector256<byte> reverseMask = Vector256.Create(
(byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // first 128-bit lane
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // second 128-bit lane
nuint numElements = (nuint)Vector256<byte>.Count;
nuint numIters = (length / numElements) / 2;
for (nuint i = 0; i < numIters; i++)
{
nuint firstOffset = i * numElements;
nuint lastOffset = length - ((1 + i) * numElements);

// Load in values from beginning and end of the array.
Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref buf, firstOffset);
Vector256<byte> tempLast = Vector256.LoadUnsafe(ref buf, lastOffset);
nint lastOffset = remainder - Vector256<byte>.Count;
do
{
// Load the values into vectors
Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref buf, (nuint)offset);
Vector256<byte> tempLast = Vector256.LoadUnsafe(ref buf, (nuint)lastOffset);

// Avx2 operates on two 128-bit lanes rather than the full 256-bit vector.
// Perform a shuffle to reverse each 128-bit lane, then permute to finish reversing the vector:
Expand All @@ -1170,24 +1173,23 @@ public static void Reverse(ref byte buf, nuint length)
tempLast = Avx2.Permute2x128(tempLast, tempLast, 0b00_01);

// Store the reversed vectors
tempLast.StoreUnsafe(ref buf, firstOffset);
tempFirst.StoreUnsafe(ref buf, lastOffset);
}
buf = ref Unsafe.Add(ref buf, numIters * numElements);
length -= numIters * numElements * 2;
tempLast.StoreUnsafe(ref buf, (nuint)offset);
tempFirst.StoreUnsafe(ref buf, (nuint)lastOffset);

offset += Vector256<byte>.Count;
lastOffset -= Vector256<byte>.Count;
} while (lastOffset >= offset);

remainder = lastOffset + Vector256<byte>.Count - offset;
}
else if (Vector128.IsHardwareAccelerated && (nuint)Vector128<byte>.Count * 2 <= length)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the removal of Vector128 code path causes a 10% regression for larger inputs on arm64. This needs to be addressed. I can run the benchmarks for you on an arm64 machine, just ping me here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed it back to the previous behavior, can you run the benchmarks again when you got time, please?

else if (Vector128.IsHardwareAccelerated && remainder >= Vector128<byte>.Count)
{
nuint numElements = (nuint)Vector128<byte>.Count;
nuint numIters = (length / numElements) / 2;
for (nuint i = 0; i < numIters; i++)
nint lastOffset = remainder - Vector128<byte>.Count;
do
{
nuint firstOffset = i * numElements;
nuint lastOffset = length - ((1 + i) * numElements);

// Load in values from beginning and end of the array.
Vector128<byte> tempFirst = Vector128.LoadUnsafe(ref buf, firstOffset);
Vector128<byte> tempLast = Vector128.LoadUnsafe(ref buf, lastOffset);
// Load the values into vectors
Vector128<byte> tempFirst = Vector128.LoadUnsafe(ref buf, (nuint)offset);
Vector128<byte> tempLast = Vector128.LoadUnsafe(ref buf, (nuint)lastOffset);

// Shuffle to reverse each vector:
// +---------------------------------------------------------------+
Expand All @@ -1203,15 +1205,58 @@ public static void Reverse(ref byte buf, nuint length)
(byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));

// Store the reversed vectors
tempLast.StoreUnsafe(ref buf, firstOffset);
tempFirst.StoreUnsafe(ref buf, lastOffset);
}
buf = ref Unsafe.Add(ref buf, numIters * numElements);
length -= numIters * numElements * 2;
tempLast.StoreUnsafe(ref buf, (nuint)offset);
tempFirst.StoreUnsafe(ref buf, (nuint)lastOffset);

offset += Vector128<byte>.Count;
lastOffset -= Vector128<byte>.Count;
} while (lastOffset >= offset);

remainder = lastOffset + Vector128<byte>.Count - offset;
}

if (remainder >= sizeof(long))
{
nint lastOffset = (nint)length - offset - sizeof(long);
do
{
long tempFirst = Unsafe.ReadUnaligned<long>(ref Unsafe.Add(ref buf, offset));
long tempLast = Unsafe.ReadUnaligned<long>(ref Unsafe.Add(ref buf, lastOffset));

// swap and store in reversed position
Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, offset), BinaryPrimitives.ReverseEndianness(tempLast));
Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, lastOffset), BinaryPrimitives.ReverseEndianness(tempFirst));

offset += sizeof(long);
lastOffset -= sizeof(long);
} while (lastOffset >= offset);

remainder = lastOffset + sizeof(long) - offset;
}

// Store any remaining values one-by-one
ReverseInner(ref buf, length);
if (remainder >= sizeof(int))
{
nint lastOffset = (nint)length - offset - sizeof(int);
do
{
int tempFirst = Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref buf, offset));
int tempLast = Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref buf, lastOffset));

// swap and store in reversed position
Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, offset), BinaryPrimitives.ReverseEndianness(tempLast));
Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, lastOffset), BinaryPrimitives.ReverseEndianness(tempFirst));

offset += sizeof(int);
lastOffset -= sizeof(int);
} while (lastOffset >= offset);

remainder = lastOffset + sizeof(int) - offset;
}

if (remainder > 1)
{
ReverseInner(ref Unsafe.Add(ref buf, offset), (nuint)remainder);
}
}
}
}
82 changes: 44 additions & 38 deletions src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
Original file line number Diff line number Diff line change
Expand Up @@ -733,23 +733,25 @@ private static unsafe nint UnalignedCountVector128(ref char searchSpace)

public static void Reverse(ref char buf, nuint length)
{
if (Avx2.IsSupported && (nuint)Vector256<short>.Count * 2 <= length)
Debug.Assert(length > 1);

nint remainder = (nint)length;
nint offset = 0;

if (Avx2.IsSupported && remainder >= Vector256<ushort>.Count)
{
ref byte bufByte = ref Unsafe.As<char, byte>(ref buf);
nuint byteLength = length * sizeof(char);
Vector256<byte> reverseMask = Vector256.Create(
(byte)14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, // first 128-bit lane
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); // second 128-bit lane
nuint numElements = (nuint)Vector256<byte>.Count;
nuint numIters = (byteLength / numElements) / 2;
for (nuint i = 0; i < numIters; i++)

nint lastOffset = remainder - Vector256<ushort>.Count;
do
{
nuint firstOffset = i * numElements;
nuint lastOffset = byteLength - ((1 + i) * numElements);
ref byte first = ref Unsafe.As<char, byte>(ref Unsafe.Add(ref buf, offset));
ref byte last = ref Unsafe.As<char, byte>(ref Unsafe.Add(ref buf, lastOffset));

// Load in values from beginning and end of the array.
Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref bufByte, firstOffset);
Vector256<byte> tempLast = Vector256.LoadUnsafe(ref bufByte, lastOffset);
Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref first);
Vector256<byte> tempLast = Vector256.LoadUnsafe(ref last);

// Avx2 operates on two 128-bit lanes rather than the full 256-bit vector.
// Perform a shuffle to reverse each 128-bit lane, then permute to finish reversing the vector:
Expand All @@ -770,27 +772,25 @@ public static void Reverse(ref char buf, nuint length)
tempLast = Avx2.Permute2x128(tempLast, tempLast, 0b00_01);

// Store the reversed vectors
tempLast.StoreUnsafe(ref bufByte, firstOffset);
tempFirst.StoreUnsafe(ref bufByte, lastOffset);
}
bufByte = ref Unsafe.Add(ref bufByte, numIters * numElements);
length -= numIters * (nuint)Vector256<short>.Count * 2;
// Store any remaining values one-by-one
buf = ref Unsafe.As<byte, char>(ref bufByte);
tempLast.StoreUnsafe(ref first);
tempFirst.StoreUnsafe(ref last);

offset += Vector256<ushort>.Count;
lastOffset -= Vector256<ushort>.Count;
} while (lastOffset >= offset);

remainder = (lastOffset + Vector256<ushort>.Count - offset);
}
else if (Vector128.IsHardwareAccelerated && (nuint)Vector128<short>.Count * 2 <= length)
else if (Vector128.IsHardwareAccelerated && remainder >= Vector128<ushort>.Count)
{
ref short bufShort = ref Unsafe.As<char, short>(ref buf);
nuint numElements = (nuint)Vector128<short>.Count;
nuint numIters = (length / numElements) / 2;
for (nuint i = 0; i < numIters; i++)
nint lastOffset = remainder - Vector128<ushort>.Count;
do
{
nuint firstOffset = i * numElements;
nuint lastOffset = length - ((1 + i) * numElements);
ref ushort first = ref Unsafe.As<char, ushort>(ref Unsafe.Add(ref buf, offset));
ref ushort last = ref Unsafe.As<char, ushort>(ref Unsafe.Add(ref buf, lastOffset));

// Load in values from beginning and end of the array.
Vector128<short> tempFirst = Vector128.LoadUnsafe(ref bufShort, firstOffset);
Vector128<short> tempLast = Vector128.LoadUnsafe(ref bufShort, lastOffset);
Vector128<ushort> tempFirst = Vector128.LoadUnsafe(ref first);
Vector128<ushort> tempLast = Vector128.LoadUnsafe(ref last);

// Shuffle to reverse each vector:
// +-------------------------------+
Expand All @@ -800,19 +800,25 @@ public static void Reverse(ref char buf, nuint length)
// +-------------------------------+
// | H | G | F | E | D | C | B | A |
// +-------------------------------+
tempFirst = Vector128.Shuffle(tempFirst, Vector128.Create(7, 6, 5, 4, 3, 2, 1, 0));
tempLast = Vector128.Shuffle(tempLast, Vector128.Create(7, 6, 5, 4, 3, 2, 1, 0));
tempFirst = Vector128.Shuffle(tempFirst, Vector128.Create((ushort)7, 6, 5, 4, 3, 2, 1, 0));
tempLast = Vector128.Shuffle(tempLast, Vector128.Create((ushort)7, 6, 5, 4, 3, 2, 1, 0));

// Store the reversed vectors
tempLast.StoreUnsafe(ref bufShort, firstOffset);
tempFirst.StoreUnsafe(ref bufShort, lastOffset);
}
bufShort = ref Unsafe.Add(ref bufShort, numIters * numElements);
length -= numIters * (nuint)Vector128<short>.Count * 2;
// Store any remaining values one-by-one
buf = ref Unsafe.As<short, char>(ref bufShort);
tempLast.StoreUnsafe(ref first);
tempFirst.StoreUnsafe(ref last);

offset += Vector128<ushort>.Count;
lastOffset -= Vector128<ushort>.Count;
} while (lastOffset >= offset);

remainder = (lastOffset + Vector128<ushort>.Count - offset);
}

// Store any remaining values one-by-one
if (remainder > 1)
{
ReverseInner(ref Unsafe.Add(ref buf, offset), (nuint)remainder);
}
ReverseInner(ref buf, length);
}
}
}
Loading