Skip to content

Commit

Permalink
Use inline Vector{128|256}.Create for constants (#54827)
Browse files Browse the repository at this point in the history
* Used VectorXYZ.Create for constants in Base64

* Used VectorXYZ.Create for constants in BitArray

* Remove conditional compilation

It's only built for NetCoreAppCurrent, so no need to special case older runtimes.
  • Loading branch information
gfoidl committed Jul 12, 2021
1 parent 9fb28c8 commit 738b09b
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 177 deletions.
45 changes: 26 additions & 19 deletions src/libraries/System.Collections/src/System/Collections/BitArray.cs
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,6 @@ public BitArray(byte[] bytes)
_version = 0;
}

private static readonly Vector128<byte> s_bitMask128 = BitConverter.IsLittleEndian ?
Vector128.Create(0x80402010_08040201).AsByte() :
Vector128.Create(0x01020408_10204080).AsByte();

private const uint Vector128ByteCount = 16;
private const uint Vector128IntCount = 4;
private const uint Vector256ByteCount = 32;
Expand Down Expand Up @@ -190,6 +186,10 @@ public unsafe BitArray(bool[] values)
// However comparison against zero can be replaced to cmeq against zero (vceqzq_s8)
// See dotnet/runtime#33972 for details
Vector128<byte> zero = Vector128<byte>.Zero;
Vector128<byte> bitMask128 = BitConverter.IsLittleEndian ?
Vector128.Create(0x80402010_08040201).AsByte() :
Vector128.Create(0x01020408_10204080).AsByte();

fixed (bool* ptr = values)
{
for (; (i + Vector128ByteCount * 2u) <= (uint)values.Length; i += Vector128ByteCount * 2u)
Expand All @@ -199,15 +199,15 @@ public unsafe BitArray(bool[] values)
// and combine by ORing all of them together (In this case, adding all of them does the same thing)
Vector128<byte> lowerVector = AdvSimd.LoadVector128((byte*)ptr + i);
Vector128<byte> lowerIsFalse = AdvSimd.CompareEqual(lowerVector, zero);
Vector128<byte> bitsExtracted1 = AdvSimd.And(lowerIsFalse, s_bitMask128);
Vector128<byte> bitsExtracted1 = AdvSimd.And(lowerIsFalse, bitMask128);
bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
Vector128<short> lowerPackedIsFalse = bitsExtracted1.AsInt16();

Vector128<byte> upperVector = AdvSimd.LoadVector128((byte*)ptr + i + Vector128<byte>.Count);
Vector128<byte> upperIsFalse = AdvSimd.CompareEqual(upperVector, zero);
Vector128<byte> bitsExtracted2 = AdvSimd.And(upperIsFalse, s_bitMask128);
Vector128<byte> bitsExtracted2 = AdvSimd.And(upperIsFalse, bitMask128);
bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
Expand Down Expand Up @@ -857,12 +857,6 @@ public int Length
}
}

// The mask used when shuffling a single int into Vector128/256.
// On little endian machines, the lower 8 bits of int belong in the first byte, next lower 8 in the second and so on.
// Each mask byte names the source byte of the int that holds the bit for that output position, so that only the relevant bit can be masked out later.
private static readonly Vector128<byte> s_lowerShuffleMask_CopyToBoolArray = Vector128.Create(0, 0x01010101_01010101).AsByte();
private static readonly Vector128<byte> s_upperShuffleMask_CopyToBoolArray = Vector128.Create(0x02020202_02020202, 0x03030303_03030303).AsByte();

public unsafe void CopyTo(Array array, int index)
{
if (array == null)
Expand Down Expand Up @@ -953,9 +947,15 @@ public unsafe void CopyTo(Array array, int index)
if (m_length < BitsPerInt32)
goto LessThan32;

// The mask used when shuffling a single int into Vector128/256.
// On little endian machines, the lower 8 bits of int belong in the first byte, next lower 8 in the second and so on.
// Each mask byte names the source byte of the int that holds the bit for that output position, so that only the relevant bit can be masked out later.
Vector128<byte> lowerShuffleMask_CopyToBoolArray = Vector128.Create(0, 0x01010101_01010101).AsByte();
Vector128<byte> upperShuffleMask_CopyToBoolArray = Vector128.Create(0x02020202_02020202, 0x03030303_03030303).AsByte();

if (Avx2.IsSupported)
{
Vector256<byte> shuffleMask = Vector256.Create(s_lowerShuffleMask_CopyToBoolArray, s_upperShuffleMask_CopyToBoolArray);
Vector256<byte> shuffleMask = Vector256.Create(lowerShuffleMask_CopyToBoolArray, upperShuffleMask_CopyToBoolArray);
Vector256<byte> bitMask = Vector256.Create(0x80402010_08040201).AsByte();
Vector256<byte> ones = Vector256.Create((byte)1);

Expand All @@ -977,9 +977,12 @@ public unsafe void CopyTo(Array array, int index)
}
else if (Ssse3.IsSupported)
{
Vector128<byte> lowerShuffleMask = s_lowerShuffleMask_CopyToBoolArray;
Vector128<byte> upperShuffleMask = s_upperShuffleMask_CopyToBoolArray;
Vector128<byte> lowerShuffleMask = lowerShuffleMask_CopyToBoolArray;
Vector128<byte> upperShuffleMask = upperShuffleMask_CopyToBoolArray;
Vector128<byte> ones = Vector128.Create((byte)1);
Vector128<byte> bitMask128 = BitConverter.IsLittleEndian ?
Vector128.Create(0x80402010_08040201).AsByte() :
Vector128.Create(0x01020408_10204080).AsByte();

fixed (bool* destination = &boolArray[index])
{
Expand All @@ -989,12 +992,12 @@ public unsafe void CopyTo(Array array, int index)
Vector128<int> scalar = Vector128.CreateScalarUnsafe(bits);

Vector128<byte> shuffledLower = Ssse3.Shuffle(scalar.AsByte(), lowerShuffleMask);
Vector128<byte> extractedLower = Sse2.And(shuffledLower, s_bitMask128);
Vector128<byte> extractedLower = Sse2.And(shuffledLower, bitMask128);
Vector128<byte> normalizedLower = Sse2.Min(extractedLower, ones);
Sse2.Store((byte*)destination + i, normalizedLower);

Vector128<byte> shuffledHigher = Ssse3.Shuffle(scalar.AsByte(), upperShuffleMask);
Vector128<byte> extractedHigher = Sse2.And(shuffledHigher, s_bitMask128);
Vector128<byte> extractedHigher = Sse2.And(shuffledHigher, bitMask128);
Vector128<byte> normalizedHigher = Sse2.Min(extractedHigher, ones);
Sse2.Store((byte*)destination + i + Vector128<byte>.Count, normalizedHigher);
}
Expand All @@ -1003,6 +1006,10 @@ public unsafe void CopyTo(Array array, int index)
else if (AdvSimd.IsSupported)
{
Vector128<byte> ones = Vector128.Create((byte)1);
Vector128<byte> bitMask128 = BitConverter.IsLittleEndian ?
Vector128.Create(0x80402010_08040201).AsByte() :
Vector128.Create(0x01020408_10204080).AsByte();

fixed (bool* destination = &boolArray[index])
{
for (; (i + Vector128ByteCount * 2u) <= (uint)m_length; i += Vector128ByteCount * 2u)
Expand All @@ -1028,12 +1035,12 @@ public unsafe void CopyTo(Array array, int index)
vector = AdvSimd.Arm64.ZipLow(vector, vector);

Vector128<byte> shuffledLower = AdvSimd.Arm64.ZipLow(vector, vector);
Vector128<byte> extractedLower = AdvSimd.And(shuffledLower, s_bitMask128);
Vector128<byte> extractedLower = AdvSimd.And(shuffledLower, bitMask128);
Vector128<byte> normalizedLower = AdvSimd.Min(extractedLower, ones);
AdvSimd.Store((byte*)destination + i, normalizedLower);

Vector128<byte> shuffledHigher = AdvSimd.Arm64.ZipHigh(vector, vector);
Vector128<byte> extractedHigher = AdvSimd.And(shuffledHigher, s_bitMask128);
Vector128<byte> extractedHigher = AdvSimd.And(shuffledHigher, bitMask128);
Vector128<byte> normalizedHigher = AdvSimd.Min(extractedHigher, ones);
AdvSimd.Store((byte*)destination + i + Vector128<byte>.Count, normalizedHigher);
}
Expand Down
8 changes: 0 additions & 8 deletions src/libraries/System.Memory/src/System/Buffers/Text/Base64.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,12 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using Internal.Runtime.CompilerServices;

namespace System.Buffers.Text
{
public static partial class Base64
{
// Reinterprets the memory at the start of `data` as a TVector and returns that value.
// No length check is performed here, so `data` is presumably expected to hold at least
// one TVector worth of bytes - verify against callers.
private static TVector ReadVector<TVector>(ReadOnlySpan<sbyte> data)
    => Unsafe.As<sbyte, TVector>(ref MemoryMarshal.GetReference(data));

[Conditional("DEBUG")]
private static unsafe void AssertRead<TVector>(byte* src, byte* srcStart, int srcLength)
{
Expand Down
172 changes: 77 additions & 95 deletions src/libraries/System.Memory/src/System/Buffers/Text/Base64Decoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Spa
maxSrcLength = (destLength / 3) * 4;
}

ref sbyte decodingMap = ref MemoryMarshal.GetReference(s_decodingMap);
ref sbyte decodingMap = ref MemoryMarshal.GetReference(DecodingMap);
srcMax = srcBytes + (uint)maxSrcLength;

while (src < srcMax)
Expand Down Expand Up @@ -275,7 +275,7 @@ public static unsafe OperationStatus DecodeFromUtf8InPlace(Span<byte> buffer, ou
if (bufferLength == 0)
goto DoneExit;

ref sbyte decodingMap = ref MemoryMarshal.GetReference(s_decodingMap);
ref sbyte decodingMap = ref MemoryMarshal.GetReference(DecodingMap);

while (sourceIndex < bufferLength - 4)
{
Expand Down Expand Up @@ -362,14 +362,59 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b
// See SSSE3-version below for an explanation of how the code works.

// The JIT won't hoist these "constants", so help it
Vector256<sbyte> lutHi = ReadVector<Vector256<sbyte>>(s_avxDecodeLutHi);
Vector256<sbyte> lutLo = ReadVector<Vector256<sbyte>>(s_avxDecodeLutLo);
Vector256<sbyte> lutShift = ReadVector<Vector256<sbyte>>(s_avxDecodeLutShift);
Vector256<sbyte> lutHi = Vector256.Create(
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10);

Vector256<sbyte> lutLo = Vector256.Create(
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A,
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A);

Vector256<sbyte> lutShift = Vector256.Create(
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0,
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0);

Vector256<sbyte> packBytesInLaneMask = Vector256.Create(
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1,
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1);

Vector256<int> packLanesControl = Vector256.Create(
0, 0, 0, 0,
1, 0, 0, 0,
2, 0, 0, 0,
4, 0, 0, 0,
5, 0, 0, 0,
6, 0, 0, 0,
-1, -1, -1, -1,
-1, -1, -1, -1).AsInt32();

Vector256<sbyte> mask2F = Vector256.Create((sbyte)'/');
Vector256<sbyte> mergeConstant0 = Vector256.Create(0x01400140).AsSByte();
Vector256<short> mergeConstant1 = Vector256.Create(0x00011000).AsInt16();
Vector256<sbyte> packBytesInLaneMask = ReadVector<Vector256<sbyte>>(s_avxDecodePackBytesInLaneMask);
Vector256<int> packLanesControl = ReadVector<Vector256<sbyte>>(s_avxDecodePackLanesControl).AsInt32();

byte* src = srcBytes;
byte* dest = destBytes;
Expand Down Expand Up @@ -508,13 +553,33 @@ private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes,
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10

// The JIT won't hoist these "constants", so help it
Vector128<sbyte> lutHi = ReadVector<Vector128<sbyte>>(s_sseDecodeLutHi);
Vector128<sbyte> lutLo = ReadVector<Vector128<sbyte>>(s_sseDecodeLutLo);
Vector128<sbyte> lutShift = ReadVector<Vector128<sbyte>>(s_sseDecodeLutShift);
Vector128<sbyte> lutHi = Vector128.Create(
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10);

Vector128<sbyte> lutLo = Vector128.Create(
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A);

Vector128<sbyte> lutShift = Vector128.Create(
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0);

Vector128<sbyte> packBytesMask = Vector128.Create(
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1);

Vector128<sbyte> mask2F = Vector128.Create((sbyte)'/');
Vector128<sbyte> mergeConstant0 = Vector128.Create(0x01400140).AsSByte();
Vector128<short> mergeConstant1 = Vector128.Create(0x00011000).AsInt16();
Vector128<sbyte> packBytesMask = ReadVector<Vector128<sbyte>>(s_sseDecodePackBytesMask);
Vector128<sbyte> zero = Vector128<sbyte>.Zero;

byte* src = srcBytes;
Expand Down Expand Up @@ -613,7 +678,7 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value)
}

// This table was pre-computed using a custom string (s_characters) and GenerateDecodingMapAndVerify (found in tests).
private static ReadOnlySpan<sbyte> s_decodingMap => new sbyte[] {
private static ReadOnlySpan<sbyte> DecodingMap => new sbyte[] {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, //62 is placed at index 43 (for +), 63 at index 47 (for /)
Expand All @@ -631,88 +696,5 @@ private static unsafe void WriteThreeLowOrderBytes(byte* destination, int value)
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};

// 16-entry sbyte table that Ssse3Decode reads as a Vector128<sbyte> (local `packBytesMask`)
// through ReadVector. Element order is significant; do not reorder.
// NOTE(review): the name suggests a byte-shuffle control used when packing decoded bytes - confirm in the decode loop.
private static ReadOnlySpan<sbyte> s_sseDecodePackBytesMask => new sbyte[] {
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1
};

// 16-entry sbyte lookup table that Ssse3Decode reads as a Vector128<sbyte> (local `lutLo`)
// through ReadVector. Element order is significant; do not reorder.
private static ReadOnlySpan<sbyte> s_sseDecodeLutLo => new sbyte[] {
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A
};

// 16-entry sbyte lookup table that Ssse3Decode reads as a Vector128<sbyte> (local `lutHi`)
// through ReadVector. Element order is significant; do not reorder.
private static ReadOnlySpan<sbyte> s_sseDecodeLutHi => new sbyte[] {
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10
};

// 16-entry sbyte lookup table that Ssse3Decode reads as a Vector128<sbyte> (local `lutShift`)
// through ReadVector. Element order is significant; do not reorder.
private static ReadOnlySpan<sbyte> s_sseDecodeLutShift => new sbyte[] {
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0
};

// 32-entry sbyte table that Avx2Decode reads as a Vector256<sbyte> (local `packBytesInLaneMask`)
// through ReadVector. The second 16 entries repeat the first 16, which in turn hold the same
// values as s_sseDecodePackBytesMask. Element order is significant; do not reorder.
private static ReadOnlySpan<sbyte> s_avxDecodePackBytesInLaneMask => new sbyte[] {
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1,
2, 1, 0, 6,
5, 4, 10, 9,
8, 14, 13, 12,
-1, -1, -1, -1
};

// 32-entry sbyte table that Avx2Decode reads as a Vector256<sbyte> through ReadVector and then
// reinterprets with AsInt32() (local `packLanesControl`). Each row of four bytes therefore forms
// one 32-bit element; read on a little-endian machine the eight elements are 0, 1, 2, 4, 5, 6, -1, -1.
// Element order is significant; do not reorder.
private static ReadOnlySpan<sbyte> s_avxDecodePackLanesControl => new sbyte[] {
0, 0, 0, 0,
1, 0, 0, 0,
2, 0, 0, 0,
4, 0, 0, 0,
5, 0, 0, 0,
6, 0, 0, 0,
-1, -1, -1, -1,
-1, -1, -1, -1
};

// 32-entry sbyte lookup table that Avx2Decode reads as a Vector256<sbyte> (local `lutLo`)
// through ReadVector. The second 16 entries repeat the first 16, which hold the same values
// as s_sseDecodeLutLo. Element order is significant; do not reorder.
private static ReadOnlySpan<sbyte> s_avxDecodeLutLo => new sbyte[] {
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A,
0x15, 0x11, 0x11, 0x11,
0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A,
0x1B, 0x1B, 0x1B, 0x1A
};

// 32-entry sbyte lookup table that Avx2Decode reads as a Vector256<sbyte> (local `lutHi`)
// through ReadVector. The second 16 entries repeat the first 16, which hold the same values
// as s_sseDecodeLutHi. Element order is significant; do not reorder.
private static ReadOnlySpan<sbyte> s_avxDecodeLutHi => new sbyte[] {
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x01, 0x02,
0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x10, 0x10
};

// 32-entry sbyte lookup table that Avx2Decode reads as a Vector256<sbyte> (local `lutShift`)
// through ReadVector. The second 16 entries repeat the first 16, which hold the same values
// as s_sseDecodeLutShift. Element order is significant; do not reorder.
private static ReadOnlySpan<sbyte> s_avxDecodeLutShift => new sbyte[] {
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0,
0, 16, 19, 4,
-65, -65, -71, -71,
0, 0, 0, 0,
0, 0, 0, 0
};
}
}
Loading

0 comments on commit 738b09b

Please sign in to comment.