Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to iSimdVector and Align WidenAsciiToUtf16 #99982

Merged
merged 5 commits into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,19 @@ static virtual bool TryCopyTo(TSelf vector, Span<T> destination)
// New Surface Area
//

/// <summary>Checks if any of the vector lanes are equivalent to value.</summary>
/// <param name="vector">The Vector.</param>
/// <param name="value">The Value to check.</param>
/// <returns><c>true</c> if <paramref name="vector" /> has any lanes equivalent to <paramref name="value" /> otherwise, <c>false</c> if none of the lanes are equivalent to <paramref name="value" /> />.</returns>
/// <exception cref="NotSupportedException">The type of the elements in the vector (<typeparamref name="T" />) is not supported.</exception>
static abstract bool Any(TSelf vector, T value);

/// <summary>Checks if any of the vector lanes have All Bits set.</summary>
/// <param name="vector">The Vector to check.</param>
/// <returns><c>true</c> if <paramref name="vector" /> has any lanes with All Bits set otherwise, <c>false</c> if none of the lanes have All Bits set />.</returns>
/// <exception cref="NotSupportedException">The type of the elements in the vector (<typeparamref name="T" />) is not supported.</exception>
static abstract bool AnyWhereAllBitsSet(TSelf vector);

static abstract int IndexOfLastMatch(TSelf vector);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri
// New Surface Area
//

static bool ISimdVector<Vector128<T>, T>.AnyWhereAllBitsSet(Vector128<T> vector)
{
return (Vector128.EqualsAny(vector, Vector128<T>.AllBitsSet));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: unnecessary parens here and in Any

If you could fix that in a follow up PR, that'd be great (going to merge this)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good! Will put it up later today.

}

static bool ISimdVector<Vector128<T>, T>.Any(Vector128<T> vector, T value)
{
return (Vector128.EqualsAny(vector, Vector128.Create((T)value)));
}

static int ISimdVector<Vector128<T>, T>.IndexOfLastMatch(Vector128<T> vector)
{
uint mask = vector.ExtractMostSignificantBits();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri
// New Surface Area
//

static bool ISimdVector<Vector256<T>, T>.AnyWhereAllBitsSet(Vector256<T> vector)
{
return (Vector256.EqualsAny(vector, Vector256<T>.AllBitsSet));
}

static bool ISimdVector<Vector256<T>, T>.Any(Vector256<T> vector, T value)
{
return (Vector256.EqualsAny(vector, Vector256.Create((T)value)));
}

static int ISimdVector<Vector256<T>, T>.IndexOfLastMatch(Vector256<T> vector)
{
uint mask = vector.ExtractMostSignificantBits();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri
// New Surface Area
//

static bool ISimdVector<Vector512<T>, T>.AnyWhereAllBitsSet(Vector512<T> vector)
{
return (Vector512.EqualsAny(vector, Vector512<T>.AllBitsSet));
}

static bool ISimdVector<Vector512<T>, T>.Any(Vector512<T> vector, T value)
{
return (Vector512.EqualsAny(vector, Vector512.Create((T)value)));
}

static int ISimdVector<Vector512<T>, T>.IndexOfLastMatch(Vector512<T> vector)
{
ulong mask = vector.ExtractMostSignificantBits();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -757,6 +757,16 @@ private string ToString([StringSyntax(StringSyntaxAttribute.NumericFormat)] stri
// New Surface Area
//

static bool ISimdVector<Vector64<T>, T>.AnyWhereAllBitsSet(Vector64<T> vector)
{
return (Vector64.EqualsAny(vector, Vector64<T>.AllBitsSet));
}

static bool ISimdVector<Vector64<T>, T>.Any(Vector64<T> vector, T value)
{
return (Vector64.EqualsAny(vector, Vector64.Create((T)value)));
}

static int ISimdVector<Vector64<T>, T>.IndexOfLastMatch(Vector64<T> vector)
{
uint mask = vector.ExtractMostSignificantBits();
Expand Down
153 changes: 85 additions & 68 deletions src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2038,79 +2038,17 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B

if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128<byte>.Count)
{
ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer;

if (Vector512.IsHardwareAccelerated && elementCount >= (uint)Vector512<byte>.Count)
if (Vector512.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector512<byte>.Count)
{
// Calculating the destination address outside the loop results in significant
// perf wins vs. relying on the JIT to fold memory addressing logic into the
// write instructions. See: https://github.com/dotnet/runtime/issues/33002
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector512<byte>.Count;

do
{
Vector512<byte> asciiVector = Vector512.Load(pAsciiBuffer + currentOffset);

if (asciiVector.ExtractMostSignificantBits() != 0)
{
break;
}

(Vector512<ushort> utf16LowVector, Vector512<ushort> utf16HighVector) = Vector512.Widen(asciiVector);
utf16LowVector.Store(pCurrentWriteAddress);
utf16HighVector.Store(pCurrentWriteAddress + Vector512<ushort>.Count);

currentOffset += (nuint)Vector512<byte>.Count;
pCurrentWriteAddress += (nuint)Vector512<byte>.Count;
} while (currentOffset <= finalOffsetWhereCanRunLoop);
WidenAsciiToUtf1_Vector<Vector512<byte>, Vector512<ushort>>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount);
}
else if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256<byte>.Count)
else if (Vector256.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector256<byte>.Count)
{
// Calculating the destination address outside the loop results in significant
// perf wins vs. relying on the JIT to fold memory addressing logic into the
// write instructions. See: https://github.com/dotnet/runtime/issues/33002
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector256<byte>.Count;

do
{
Vector256<byte> asciiVector = Vector256.Load(pAsciiBuffer + currentOffset);

if (asciiVector.ExtractMostSignificantBits() != 0)
{
break;
}

(Vector256<ushort> utf16LowVector, Vector256<ushort> utf16HighVector) = Vector256.Widen(asciiVector);
utf16LowVector.Store(pCurrentWriteAddress);
utf16HighVector.Store(pCurrentWriteAddress + Vector256<ushort>.Count);

currentOffset += (nuint)Vector256<byte>.Count;
pCurrentWriteAddress += (nuint)Vector256<byte>.Count;
} while (currentOffset <= finalOffsetWhereCanRunLoop);
WidenAsciiToUtf1_Vector<Vector256<byte>, Vector256<ushort>>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount);
}
else
else if (Vector128.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector128<byte>.Count)
{
// Calculating the destination address outside the loop results in significant
// perf wins vs. relying on the JIT to fold memory addressing logic into the
// write instructions. See: https://github.com/dotnet/runtime/issues/33002
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector128<byte>.Count;

do
{
Vector128<byte> asciiVector = Vector128.Load(pAsciiBuffer + currentOffset);

if (VectorContainsNonAsciiChar(asciiVector))
{
break;
}

(Vector128<ushort> utf16LowVector, Vector128<ushort> utf16HighVector) = Vector128.Widen(asciiVector);
utf16LowVector.Store(pCurrentWriteAddress);
utf16HighVector.Store(pCurrentWriteAddress + Vector128<ushort>.Count);

currentOffset += (nuint)Vector128<byte>.Count;
pCurrentWriteAddress += (nuint)Vector128<byte>.Count;
} while (currentOffset <= finalOffsetWhereCanRunLoop);
WidenAsciiToUtf1_Vector<Vector128<byte>, Vector128<ushort>>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount);
}
}

Expand Down Expand Up @@ -2212,6 +2150,85 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
goto Finish;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WidenAsciiToUtf1_Vector<TVectorByte, TVectorUInt16>(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount)
where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
where TVectorUInt16 : unmanaged, ISimdVector<TVectorUInt16, ushort>
{
ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer;
// Calculating the destination address outside the loop results in significant
// perf wins vs. relying on the JIT to fold memory addressing logic into the
// write instructions. See: https://github.com/dotnet/runtime/issues/33002
nuint finalOffsetWhereCanRunLoop = elementCount - (nuint)TVectorByte.Count;
TVectorByte asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset);
if (!HasMatch<TVectorByte>(asciiVector))
{
(TVectorUInt16 utf16LowVector, TVectorUInt16 utf16HighVector) = Widen<TVectorByte, TVectorUInt16>(asciiVector);
utf16LowVector.Store(pCurrentWriteAddress);
utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count);
pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2);
if (((nuint)pCurrentWriteAddress % sizeof(char)) == 0)
{
// Bump write buffer up to the next aligned boundary
pCurrentWriteAddress = (ushort*)((nuint)pCurrentWriteAddress & ~(nuint)(TVectorUInt16.Alignment - 1));
nuint numBytesWritten = (nuint)pCurrentWriteAddress - (nuint)pUtf16Buffer;
currentOffset += (nuint)numBytesWritten / 2;
}
else
{
// If input isn't char aligned, we won't be able to align it to a Vector
currentOffset += (nuint)TVectorByte.Count;
}
while (currentOffset <= finalOffsetWhereCanRunLoop)
{
asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset);
if (HasMatch<TVectorByte>(asciiVector))
{
break;
}
(utf16LowVector, utf16HighVector) = Widen<TVectorByte, TVectorUInt16>(asciiVector);
utf16LowVector.Store(pCurrentWriteAddress);
utf16HighVector.Store(pCurrentWriteAddress + TVectorUInt16.Count);

currentOffset += (nuint)TVectorByte.Count;
pCurrentWriteAddress += (nuint)(TVectorUInt16.Count * 2);
}
}
return;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe bool HasMatch<TVectorByte>(TVectorByte vector)
where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
{
return !(vector & TVectorByte.Create((byte)0x80)).Equals(TVectorByte.Zero);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not (vector & TVectorByte.Create((byte)0x80)) != TVectorByte.Zero?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I basically ran into a weird issue where perf degraded significantly for specific cases on my ICX when I had '(vector & TVectorByte.Create((byte)0x80)) != TVectorByte.Zero' and the issue went away with what I have now. It was pretty consistent. I had some trouble narrowing down the exact why with VTune though.

I decided to go with the performant version for now

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was there a codegen difference between them? I'd expect them to generate the same code

Copy link
Contributor Author

@DeepakRajendrakumaran DeepakRajendrakumaran Mar 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did a quick check on godbolt and they all look the same - https://godbolt.org/z/P87dPdTGa

Now I'm curious if it was just something off when I ran it locally

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did some more digging and something is off. Tried 3 versions with a handwritten benchmark
image
Benchmark
image

Match3 is significantly faster than Match1 and Match 2

VTune for Match1 vs Match3
image

The inlining makes it hard to narrow down

image

Copy link
Contributor Author

@DeepakRajendrakumaran DeepakRajendrakumaran Mar 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have a strong preference for any of these patterns?

I can look into the why this is happening if it's important. For now, I'm just keeping the fast version

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tannergooding I have created an issue detailing this as discussed: #100493

Please let me know if there is anything else needed for this PR

}


[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe (TVectorUInt16 Lower, TVectorUInt16 Upper) Widen<TVectorByte, TVectorUInt16>(TVectorByte vector)
where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
where TVectorUInt16 : unmanaged, ISimdVector<TVectorUInt16, ushort>
{
if (typeof(TVectorByte) == typeof(Vector256<byte>))
{
(Vector256<ushort> Lower256, Vector256<ushort> Upper256) = Vector256.Widen((Vector256<byte>)(object)vector);
return ((TVectorUInt16)(object)Lower256, (TVectorUInt16)(object)Upper256);
}
else if (typeof(TVectorByte) == typeof(Vector512<byte>))
{
(Vector512<ushort> Lower512, Vector512<ushort> Upper512) = Vector512.Widen((Vector512<byte>)(object)vector);
return ((TVectorUInt16)(object)Lower512, (TVectorUInt16)(object)Upper512);
}
else
{
Debug.Assert(typeof(TVectorByte) == typeof(Vector128<byte>));
(Vector128<ushort> Lower128, Vector128<ushort> Upper128) = Vector128.Widen((Vector128<byte>)(object)vector);
return ((TVectorUInt16)(object)Lower128, (TVectorUInt16)(object)Upper128);
}
}


/// <summary>
/// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
/// writes them to the output buffer with machine endianness.
Expand Down
Loading