Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve XmlDictionaryWriter UTF8 encoding performance #73336

Merged
merged 27 commits into from
Apr 4, 2023
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
5d09005
Speed up text encoding
Daniel-Svensson Jul 7, 2022
63c760c
Update implementation
Daniel-Svensson Jul 18, 2022
196ce48
Add tests for binary xml strings
Daniel-Svensson Jul 26, 2022
65e7029
Merge branch 'binary_xml_text' of https://github.com/Daniel-Svensson/…
Daniel-Svensson Jul 26, 2022
4d8078a
limit counting code to 256 bit vectors
Daniel-Svensson Jul 26, 2022
6e5aabb
reword comment
Daniel-Svensson Aug 3, 2022
70fa189
rename test
Daniel-Svensson Aug 3, 2022
b34d259
move bytesmax
Daniel-Svensson Aug 3, 2022
5df5ae0
Fix bytesMax after moving variable initialization
Daniel-Svensson Aug 4, 2022
a790fbb
use unicode escape value in test
Daniel-Svensson Aug 4, 2022
2b82ac8
fix test typo "*" -> "+"
Daniel-Svensson Aug 4, 2022
301e531
Update src/libraries/System.Private.DataContractSerialization/src/Sys…
Daniel-Svensson Aug 12, 2022
5a21306
Remove vectorized code from UnsafeGetUTF8Length
Daniel-Svensson Aug 12, 2022
8a3de26
Merge remote-tracking branch 'upstream/main' into binary_xml_text
Daniel-Svensson Aug 13, 2022
048cade
Fix overfload
Daniel-Svensson Sep 8, 2022
8297311
Merge commit '080f708e7018f6c0529b6c875a44d84fc4d74419' into binary_x…
Daniel-Svensson Oct 24, 2022
287e737
use for loop which seems faster
Daniel-Svensson Oct 24, 2022
0d2a9bb
merge up to net8 preview1
Daniel-Svensson Mar 3, 2023
ab29682
remove vector loop
Daniel-Svensson Mar 6, 2023
251391f
make sealed encoding to allow devirtualisation
Daniel-Svensson Mar 11, 2023
a590739
back some changes
Daniel-Svensson Mar 20, 2023
46b6314
use uint for UnsafeGetUTF8Chars comparison
Daniel-Svensson Mar 25, 2023
82f8880
revert more changes
Daniel-Svensson Mar 26, 2023
d78aade
Fix cutoff based on new measurements
Daniel-Svensson Mar 26, 2023
3b20be8
use BinaryPrimitives.ReverseEndianness as suggested
Daniel-Svensson Mar 26, 2023
9c86b05
Update cutoff from 24 to 32 chars before calling, due to regression f…
Daniel-Svensson Mar 27, 2023
ccfb008
Remove sealed encoding since it only improves XmlConvert
Daniel-Svensson Apr 2, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,12 @@
<Reference Include="System.Collections.Specialized" />
<Reference Include="System.Linq" />
<Reference Include="System.Memory" />
<Reference Include="System.Numerics.Vectors" />
<Reference Include="System.Reflection.Emit.ILGeneration" />
<Reference Include="System.Reflection.Emit.Lightweight" />
<Reference Include="System.Reflection.Primitives" />
<Reference Include="System.Runtime" />
<Reference Include="System.Runtime.Intrinsics" />
<Reference Include="System.Runtime.Serialization.Formatters" />
<Reference Include="System.Runtime.Serialization.Primitives" />
<Reference Include="System.Text.Encoding.Extensions" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System.IO;
using System.Numerics;
using System.Text;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Serialization;
using System.Threading.Tasks;

Expand Down Expand Up @@ -56,18 +59,6 @@ public int Position
}
}

/// <summary>
/// Returns the number of bytes needed to encode <paramref name="chars"/>, using
/// <c>_encoding</c> when it is set and falling back to <c>s_UTF8Encoding</c> otherwise.
/// </summary>
/// <param name="chars">The characters to measure.</param>
/// <returns>The encoded byte count reported by the selected encoding.</returns>
private int GetByteCount(char[] chars) => (_encoding ?? s_UTF8Encoding).GetByteCount(chars);

protected byte[] GetBuffer(int count, out int offset)
{
DiagnosticUtility.DebugAssert(count >= 0 && count <= bufferLength, "");
Expand Down Expand Up @@ -344,37 +335,54 @@ protected unsafe void UnsafeWriteUnicodeChars(char* chars, int charCount)

protected unsafe int UnsafeGetUnicodeChars(char* chars, int charCount, byte[] buffer, int offset)
{
char* charsMax = chars + charCount;
while (chars < charsMax)
if (BitConverter.IsLittleEndian)
{
char value = *chars++;
buffer[offset++] = (byte)value;
value >>= 8;
buffer[offset++] = (byte)value;
new ReadOnlySpan<byte>((byte*)chars, 2 * charCount)
.CopyTo(buffer.AsSpan(offset));
Daniel-Svensson marked this conversation as resolved.
Show resolved Hide resolved
}
else
{
char* charsMax = chars + charCount;
while (chars < charsMax)
{
char value = *chars++;
buffer[offset++] = (byte)value;
buffer[offset++] = (byte)(value >> 8);
}
Daniel-Svensson marked this conversation as resolved.
Show resolved Hide resolved
}

return charCount * 2;
}

protected unsafe int UnsafeGetUTF8Length(char* chars, int charCount)
{
char* charsMax = chars + charCount;
while (chars < charsMax)

// This method is only called from 2 places, which will use lengths of at least 128/3 and 256/3 respectively.
// We avoid Vector<T> since it is unclear how downclocking due to AVX512 would affect total throughput.
if (Vector256.IsHardwareAccelerated
&& Vector256<short>.Count < charCount && charCount <= 2048)
Daniel-Svensson marked this conversation as resolved.
Show resolved Hide resolved
{
if (*chars >= 0x80)
break;
char* lastSimd = chars + charCount - Vector256<short>.Count;
Vector256<short> mask = Vector256.Create(unchecked((short)0xff80));

chars++;
}
while (chars < lastSimd)
{
if (((*(Vector256<short>*)chars) & mask) != Vector256<short>.Zero)
goto NonAscii;

if (chars == charsMax)
return charCount;
chars += Vector256<short>.Count;
}

char[] chArray = new char[charsMax - chars];
for (int i = 0; i < chArray.Length; i++)
{
chArray[i] = chars[i];
if ((*(Vector256<short>*)lastSimd & mask) == Vector256<short>.Zero)
return charCount;
}
return (int)(chars - (charsMax - charCount)) + GetByteCount(chArray);

NonAscii:
int numRemaining = (int)(charsMax - chars);
int numAscii = charCount - numRemaining;

return numAscii + (_encoding ?? s_UTF8Encoding).GetByteCount(chars, numRemaining);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are the possible values of _encoding? Can it be something other than Utf8?

Note that it is better to call Encoding.UTF8.GetBytes directly without caching the encoding locally. Encoding.UTF8.GetBytes allows the devirtualization optimization to kick in, which eliminates the overhead of Encoding being an abstract type.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It can be passed by the user when creating a text XmlDictionaryWriter, but it is only assigned to _encoding if its code page is the same as UTF-8's.
So in theory it can be any encoding class, even if that is unlikely.

For s_encoding, it does not use the default constructor but passes (false, true), so I did not dare to make that change.
If it does not change the behaviour, then that can be a simple follow-up fix.

}

protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffer, int offset)
Expand All @@ -384,38 +392,51 @@ protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffe
fixed (byte* _bytes = &buffer[offset])
{
byte* bytes = _bytes;
byte* bytesMax = &bytes[buffer.Length - offset];
char* charsMax = &chars[charCount];

while (true)
if (Sse41.IsSupported && charCount >= Vector128<short>.Count)
Daniel-Svensson marked this conversation as resolved.
Show resolved Hide resolved
Daniel-Svensson marked this conversation as resolved.
Show resolved Hide resolved
{
Vector128<short> mask = Vector128.Create(unchecked((short)0xff80));
char* simdLast = chars + charCount - Vector128<short>.Count;

while (chars < simdLast)
{
Vector128<short> v = *(Vector128<short>*)chars;
if (!Sse41.TestZ(v, mask))
goto NonAscii;

Sse2.StoreScalar((long*)bytes, Sse2.PackUnsignedSaturate(v, v).AsInt64());
bytes += Vector128<short>.Count;
chars += Vector128<short>.Count;
}

Vector128<short> v2 = Sse2.LoadVector128((short*)simdLast);
if (!Sse41.TestZ(v2, mask))
goto NonAscii;

Sse2.StoreScalar((long*)(_bytes + charCount - sizeof(long)), Sse2.PackUnsignedSaturate(v2, v2).AsInt64());
return charCount;
}
// Fast path for small strings, skip and use Encoding.GetBytes for larger strings since it is faster even for the all-Ascii case
else if (charCount < 16)
{
while (chars < charsMax)
{
char t = *chars;
if (t >= 0x80)
break;
goto NonAscii;

*bytes = (byte)t;
bytes++;
chars++;
}

if (chars >= charsMax)
break;

char* charsStart = chars;
while (chars < charsMax && *chars >= 0x80)
{
chars++;
}

bytes += (_encoding ?? s_UTF8Encoding).GetBytes(charsStart, (int)(chars - charsStart), bytes, (int)(bytesMax - bytes));

if (chars >= charsMax)
break;
return charCount;
}

return (int)(bytes - _bytes);
NonAscii:
byte* bytesMax = _bytes + buffer.Length - offset;
return (int)(bytes - _bytes) + (_encoding ?? s_UTF8Encoding).GetBytes(chars, (int)(charsMax - chars), bytes, (int)(bytesMax - bytes));
}
}
return 0;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
Microsoft Visual Studio Solution File, Format Version 12.00

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.2.32616.157
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TestUtilities", "..\Common\tests\TestUtilities\TestUtilities.csproj", "{CBA80130-6773-4DF9-995C-DC6CBED89CB5}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Win32.Primitives", "..\Microsoft.Win32.Primitives\ref\Microsoft.Win32.Primitives.csproj", "{E5DB95E1-94AA-405C-9FFE-09B1E2498EE2}"
Expand Down Expand Up @@ -45,6 +49,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{DB29DBEF-FA4
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "gen", "gen", "{DE71D38E-4154-477C-9C27-3FA4ADB4098F}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Numerics.Vectors", "..\System.Numerics.Vectors\ref\System.Numerics.Vectors.csproj", "{EFE0C13B-6902-4FC9-91DD-F180420B36C8}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Runtime.Intrinsics", "..\System.Runtime.Intrinsics\ref\System.Runtime.Intrinsics.csproj", "{7F8A8D87-B49A-4C7B-8474-AC83F8CFD40B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -127,30 +135,40 @@ Global
{DF2255F4-F671-4C15-9100-D8079992E19D}.Debug|Any CPU.Build.0 = Debug|Any CPU
{DF2255F4-F671-4C15-9100-D8079992E19D}.Release|Any CPU.ActiveCfg = Release|Any CPU
{DF2255F4-F671-4C15-9100-D8079992E19D}.Release|Any CPU.Build.0 = Release|Any CPU
{EFE0C13B-6902-4FC9-91DD-F180420B36C8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{EFE0C13B-6902-4FC9-91DD-F180420B36C8}.Debug|Any CPU.Build.0 = Debug|Any CPU
{EFE0C13B-6902-4FC9-91DD-F180420B36C8}.Release|Any CPU.ActiveCfg = Release|Any CPU
{EFE0C13B-6902-4FC9-91DD-F180420B36C8}.Release|Any CPU.Build.0 = Release|Any CPU
{7F8A8D87-B49A-4C7B-8474-AC83F8CFD40B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{7F8A8D87-B49A-4C7B-8474-AC83F8CFD40B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{7F8A8D87-B49A-4C7B-8474-AC83F8CFD40B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{7F8A8D87-B49A-4C7B-8474-AC83F8CFD40B}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{CBA80130-6773-4DF9-995C-DC6CBED89CB5} = {41101B02-36C9-476B-98D5-1A6E105BBF4A}
{8B069551-9B95-464E-BB40-C56817506FEC} = {41101B02-36C9-476B-98D5-1A6E105BBF4A}
{8FF5E841-29F6-4DB7-A4F8-9281FBDA0B9C} = {41101B02-36C9-476B-98D5-1A6E105BBF4A}
{E5DB95E1-94AA-405C-9FFE-09B1E2498EE2} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
{7DF41C40-FE5D-41DF-B106-3DD77BE4D4B5} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
{1392041A-E2CA-4553-BEAF-363974651B81} = {DB29DBEF-FA4E-4334-AFB8-BFB2DA82D1DE}
{E3347E75-EAE8-4E6B-98D1-7230B1EE5450} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
{5EE18CED-28AE-4415-B5A3-C31123BF57E1} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
{E813073E-07A7-4C88-A505-484CB33C9DC4} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
{76AC3DDD-2B38-489F-A8B0-8E43054595DB} = {DB29DBEF-FA4E-4334-AFB8-BFB2DA82D1DE}
{7D7457FD-B88C-4375-926D-7D46C71E34A7} = {DE71D38E-4154-477C-9C27-3FA4ADB4098F}
{D5FF2DBA-F304-4ACB-8F82-B8F9321E22A9} = {DE71D38E-4154-477C-9C27-3FA4ADB4098F}
{DAD8EBB8-A1D6-4E8F-A334-D7F0273280D1} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
{0C045A64-AE30-47CC-A931-5B5C6C9EF06D} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
{19F785D2-F7A4-41AB-9301-A6AD7E40B238} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
{9759BE1C-98A0-4319-AC82-D432002BD66B} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
{DF2255F4-F671-4C15-9100-D8079992E19D} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
{1392041A-E2CA-4553-BEAF-363974651B81} = {DB29DBEF-FA4E-4334-AFB8-BFB2DA82D1DE}
{76AC3DDD-2B38-489F-A8B0-8E43054595DB} = {DB29DBEF-FA4E-4334-AFB8-BFB2DA82D1DE}
{6FD10BE0-24C8-456E-8B9A-FD101C05C961} = {DB29DBEF-FA4E-4334-AFB8-BFB2DA82D1DE}
{7D7457FD-B88C-4375-926D-7D46C71E34A7} = {DE71D38E-4154-477C-9C27-3FA4ADB4098F}
{D5FF2DBA-F304-4ACB-8F82-B8F9321E22A9} = {DE71D38E-4154-477C-9C27-3FA4ADB4098F}
{8B069551-9B95-464E-BB40-C56817506FEC} = {41101B02-36C9-476B-98D5-1A6E105BBF4A}
{8FF5E841-29F6-4DB7-A4F8-9281FBDA0B9C} = {41101B02-36C9-476B-98D5-1A6E105BBF4A}
{9759BE1C-98A0-4319-AC82-D432002BD66B} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
{6E942A4A-405E-4AAD-89A7-006358A8A004} = {DE71D38E-4154-477C-9C27-3FA4ADB4098F}
{DF2255F4-F671-4C15-9100-D8079992E19D} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
{EFE0C13B-6902-4FC9-91DD-F180420B36C8} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
{7F8A8D87-B49A-4C7B-8474-AC83F8CFD40B} = {18E62E91-73A2-48AE-BEFF-CE7C64DF759D}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {EE9FB522-4B73-4E3E-B63D-C21826BB7B5D}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Runtime.Serialization;
Expand Down Expand Up @@ -320,6 +321,71 @@ public static void FragmentTest()
Assert.False(FragmentHelper.CanFragment(writer));
}

[Fact]
public static void XmlBaseWriter_WriteString()
{
    // Leading bytes the assertions below expect in the writer's output, depending on whether
    // the UTF-8 byte length of the text fits in a single byte or needs two bytes.
    const byte Chars8Text = 152;
    const byte Chars16Text = 154;

    MemoryStream ms = new MemoryStream();
    XmlDictionaryWriter writer = (XmlDictionaryWriter)XmlDictionaryWriter.CreateBinaryWriter(ms);
    writer.WriteStartElement("root");

    int[] charCounts = new[] { 7, 8, 9, 15, 16, 17, 31, 32, 36, 258 };

    // One spare byte: the variant ending in a non-ASCII char encodes to (char count + 1) bytes.
    byte[] utf8Scratch = new byte[charCounts.Max() + 1];

    for (int n = 0; n < charCounts.Length; n++)
    {
        int length = charCounts[n];
        string allAscii = BuildText(length, endWithMultiByteChar: false);
        string multiByteLast = BuildText(length, endWithMultiByteChar: true);

        int numBytes = Encoding.UTF8.GetBytes(allAscii, utf8Scratch);
        Assert.True(numBytes == length, "Test setup wrong - allAscii");
        ValidateWriteText(ms, writer, allAscii, expected: utf8Scratch.AsSpan(0, numBytes));

        numBytes = Encoding.UTF8.GetBytes(multiByteLast, utf8Scratch);
        Assert.True(numBytes == length + 1, "Test setup wrong - multiByte");
        ValidateWriteText(ms, writer, multiByteLast, expected: utf8Scratch.AsSpan(0, numBytes));
    }

    // Builds a string of the requested length whose chars cycle through 0..127; optionally the
    // final char is replaced by a character that needs two bytes in UTF-8.
    static string BuildText(int length, bool endWithMultiByteChar)
    {
        char[] chars = new char[length];
        for (int i = 0; i < chars.Length; ++i)
        {
            chars[i] = (char)(i % 128);
        }

        if (endWithMultiByteChar)
        {
            chars[^1] = '\u00E4'; // 'ä' - Latin Small Letter a with Diaeresis. Latin-1 Supplement.
        }

        return new string(chars);
    }

    // Writes the text on its own into an emptied stream and checks the emitted marker byte,
    // length prefix and payload against the expected UTF-8 bytes.
    static void ValidateWriteText(MemoryStream ms, XmlDictionaryWriter writer, string text, ReadOnlySpan<byte> expected)
    {
        // Push out anything still pending, then reset the stream so it holds only this text.
        writer.Flush();
        ms.Seek(0, SeekOrigin.Begin);
        ms.SetLength(0);
        writer.WriteString(text);
        writer.Flush();

        ms.TryGetBuffer(out ArraySegment<byte> segment);
        ReadOnlySpan<byte> written = segment;

        if (expected.Length <= byte.MaxValue)
        {
            // Marker byte followed by a one-byte length.
            Assert.Equal(Chars8Text, written[0]);
            Assert.Equal(expected.Length, written[1]);
            written = written[2..];
        }
        else if (expected.Length <= ushort.MaxValue)
        {
            // Marker byte followed by a two-byte length, low byte first.
            Assert.Equal(Chars16Text, written[0]);
            Assert.Equal(expected.Length, written[1] | (written[2] << 8));
            written = written[3..];
        }
        else
        {
            Assert.Fail("test use to long length");
        }

        AssertExtensions.SequenceEqual(expected, written);
    }
}

private static bool ReadTest(MemoryStream ms, Encoding encoding, ReaderWriterFactory.ReaderWriterType rwType, byte[] byteArray)
{
ms.Position = 0;
Expand Down