Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sse2 version of select #1804

Merged
merged 3 commits into from
Nov 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 43 additions & 13 deletions src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,7 @@ public static void PredictorInverseTransform(
int mask = tileWidth - 1;
int tilesPerRow = SubSampleSize(width, transform.Bits);
int predictorModeIdxBase = (y >> transform.Bits) * tilesPerRow;
Span<short> scratch = stackalloc short[8];
while (y < yEnd)
{
int predictorModeIdx = predictorModeIdxBase;
Expand Down Expand Up @@ -608,7 +609,7 @@ public static void PredictorInverseTransform(
PredictorAdd10(input + x, output + x - width, xEnd - x, output + x);
break;
case 11:
PredictorAdd11(input + x, output + x - width, xEnd - x, output + x);
PredictorAdd11(input + x, output + x - width, xEnd - x, output + x, scratch);
break;
case 12:
PredictorAdd12(input + x, output + x - width, xEnd - x, output + x);
Expand Down Expand Up @@ -974,11 +975,11 @@ private static void PredictorAdd10(uint* input, uint* upper, int numberOfPixels,
}

[MethodImpl(InliningOptions.ShortMethod)]
private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output)
private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output, Span<short> scratch)
{
for (int x = 0; x < numberOfPixels; x++)
{
uint pred = Predictor11(output[x - 1], upper + x);
uint pred = Predictor11(output[x - 1], upper + x, scratch);
output[x] = AddPixels(input[x], pred);
}
}
Expand Down Expand Up @@ -1031,7 +1032,7 @@ private static void PredictorAdd13(uint* input, uint* upper, int numberOfPixels,
public static uint Predictor10(uint left, uint* top) => Average4(left, top[-1], top[0], top[1]);

[MethodImpl(InliningOptions.ShortMethod)]
public static uint Predictor11(uint left, uint* top) => Select(top[0], left, top[-1]);
public static uint Predictor11(uint left, uint* top, Span<short> scratch) => Select(top[0], left, top[-1], scratch);

[MethodImpl(InliningOptions.ShortMethod)]
public static uint Predictor12(uint left, uint* top) => ClampedAddSubtractFull(left, top[0], top[-1]);
Expand Down Expand Up @@ -1148,11 +1149,11 @@ public static void PredictorSub10(uint* input, uint* upper, int numPixels, uint*
}

[MethodImpl(InliningOptions.ShortMethod)]
public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output)
public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output, Span<short> scratch)
{
for (int x = 0; x < numPixels; x++)
{
uint pred = Predictor11(input[x - 1], upper + x);
uint pred = Predictor11(input[x - 1], upper + x, scratch);
output[x] = SubPixels(input[x], pred);
}
}
Expand Down Expand Up @@ -1240,14 +1241,43 @@ private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
private static Vector128<int> MkCst16(int hi, int lo) => Vector128.Create((hi << 16) | (lo & 0xffff));
#endif

private static uint Select(uint a, uint b, uint c)
private static uint Select(uint a, uint b, uint c, Span<short> scratch)
{
int paMinusPb =
Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) +
Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) +
Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) +
Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff));
return paMinusPb <= 0 ? a : b;
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse2.IsSupported)
{
Span<short> output = scratch;
fixed (short* p = output)
{
Vector128<byte> a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte();
Vector128<byte> b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte();
Vector128<byte> c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte();
Vector128<byte> ac0 = Sse2.SubtractSaturate(a0, c0);
Vector128<byte> ca0 = Sse2.SubtractSaturate(c0, a0);
Vector128<byte> bc0 = Sse2.SubtractSaturate(b0, c0);
Vector128<byte> cb0 = Sse2.SubtractSaturate(c0, b0);
Vector128<byte> ac = Sse2.Or(ac0, ca0);
Vector128<byte> bc = Sse2.Or(bc0, cb0);
Vector128<byte> pa = Sse2.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
Vector128<byte> pb = Sse2.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16());
Sse2.Store((ushort*)p, diff);
}

int paMinusPb = output[0] + output[1] + output[2] + output[3];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can put this into the fixed-block and access it via the pointer to avoid bound checks*.
If output would be too small, then there's a bug somewhere else 😉 (fortunately there's none).

* Or reverse the order to read output[3] first, then [2], and so on; then there's only one bounds check.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahhh, too late....

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or reverse the order to read output[3] first, then [2], and so on; then there's only one bounds check.

ah yeah, always forget about that trick, thx. Will do with a follow up PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll put it (incl. the return) within the fixed block here.


return (paMinusPb <= 0) ? a : b;
}
else
#endif
{
int paMinusPb =
Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) +
Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) +
Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) +
Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff));
return paMinusPb <= 0 ? a : b;
}
}

[MethodImpl(InliningOptions.ShortMethod)]
Expand Down
27 changes: 17 additions & 10 deletions src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public static void ResidualImage(
int tilesPerRow = LosslessUtils.SubSampleSize(width, bits);
int tilesPerCol = LosslessUtils.SubSampleSize(height, bits);
int maxQuantization = 1 << LosslessUtils.NearLosslessBits(nearLosslessQuality);
Span<short> scratch = stackalloc short[8];

// TODO: Can we optimize this?
int[][] histo = new int[4][];
Expand Down Expand Up @@ -84,7 +85,8 @@ public static void ResidualImage(
transparentColorMode,
usedSubtractGreen,
nearLossless,
image);
image,
scratch);

image[(tileY * tilesPerRow) + tileX] = (uint)(WebpConstants.ArgbBlack | (pred << 8));
}
Expand Down Expand Up @@ -192,7 +194,8 @@ private static int GetBestPredictorForTile(
WebpTransparentColorMode transparentColorMode,
bool usedSubtractGreen,
bool nearLossless,
Span<uint> modes)
Span<uint> modes,
Span<short> scratch)
{
const int numPredModes = 14;
int startX = tileX << bits;
Expand Down Expand Up @@ -272,7 +275,7 @@ private static int GetBestPredictorForTile(
}
}

GetResidual(width, height, upperRow, currentRow, maxDiffs, mode, startX, startX + maxX, y, maxQuantization, transparentColorMode, usedSubtractGreen, nearLossless, residuals);
GetResidual(width, height, upperRow, currentRow, maxDiffs, mode, startX, startX + maxX, y, maxQuantization, transparentColorMode, usedSubtractGreen, nearLossless, residuals, scratch);
for (int relativeX = 0; relativeX < maxX; ++relativeX)
{
UpdateHisto(histoArgb, residuals[relativeX]);
Expand Down Expand Up @@ -333,11 +336,12 @@ private static void GetResidual(
WebpTransparentColorMode transparentColorMode,
bool usedSubtractGreen,
bool nearLossless,
Span<uint> output)
Span<uint> output,
Span<short> scratch)
{
if (transparentColorMode == WebpTransparentColorMode.Preserve)
{
PredictBatch(mode, xStart, y, xEnd - xStart, currentRowSpan, upperRowSpan, output);
PredictBatch(mode, xStart, y, xEnd - xStart, currentRowSpan, upperRowSpan, output, scratch);
}
else
{
Expand Down Expand Up @@ -395,7 +399,7 @@ private static void GetResidual(
predict = LosslessUtils.Predictor10(currentRow[x - 1], upperRow + x);
break;
case 11:
predict = LosslessUtils.Predictor11(currentRow[x - 1], upperRow + x);
predict = LosslessUtils.Predictor11(currentRow[x - 1], upperRow + x, scratch);
break;
case 12:
predict = LosslessUtils.Predictor12(currentRow[x - 1], upperRow + x);
Expand Down Expand Up @@ -583,6 +587,7 @@ private static void CopyImageWithPrediction(
Span<byte> currentMaxDiffs = MemoryMarshal.Cast<uint, byte>(currentRow.Slice(width + 1));

Span<byte> lowerMaxDiffs = currentMaxDiffs.Slice(width);
Span<short> scratch = stackalloc short[8];
for (int y = 0; y < height; y++)
{
Span<uint> tmp32 = upperRow;
Expand All @@ -593,7 +598,7 @@ private static void CopyImageWithPrediction(

if (lowEffort)
{
PredictBatch(PredLowEffort, 0, y, width, currentRow, upperRow, argb.Slice(y * width));
PredictBatch(PredLowEffort, 0, y, width, currentRow, upperRow, argb.Slice(y * width), scratch);
}
else
{
Expand Down Expand Up @@ -634,7 +639,8 @@ private static void CopyImageWithPrediction(
transparentColorMode,
usedSubtractGreen,
nearLossless,
argb.Slice((y * width) + x));
argb.Slice((y * width) + x),
scratch);

x = xEnd;
}
Expand All @@ -649,7 +655,8 @@ private static void PredictBatch(
int numPixels,
Span<uint> currentSpan,
Span<uint> upperSpan,
Span<uint> outputSpan)
Span<uint> outputSpan,
Span<short> scratch)
{
#pragma warning disable SA1503 // Braces should not be omitted
fixed (uint* current = currentSpan)
Expand Down Expand Up @@ -718,7 +725,7 @@ private static void PredictBatch(
LosslessUtils.PredictorSub10(current + xStart, upper + xStart, numPixels, output);
break;
case 11:
LosslessUtils.PredictorSub11(current + xStart, upper + xStart, numPixels, output);
LosslessUtils.PredictorSub11(current + xStart, upper + xStart, numPixels, output, scratch);
break;
case 12:
LosslessUtils.PredictorSub12(current + xStart, upper + xStart, numPixels, output);
Expand Down
30 changes: 30 additions & 0 deletions tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,30 @@ private static void RunTransformColorInverseTest()
Assert.Equal(expectedOutput, pixelData);
}

// Calls LosslessUtils.Predictor11 with a fixed left pixel and two identical
// top pixels, and expects the left pixel to be returned for this input.
private static void RunPredictor11Test()
{
    // arrange
    uint[] upperRow = { 4278258949, 4278258949 };
    uint leftPixel = 4294839812;
    short[] scratchBuffer = new short[8];
    uint expected = 4294839812;
    uint actual;

    // act
    unsafe
    {
        // Pin at index 1 so that the predictor can read both top[0] and top[-1].
        fixed (uint* top = &upperRow[1])
        {
            actual = LosslessUtils.Predictor11(leftPixel, top, scratchBuffer);
        }
    }

    // assert
    Assert.Equal(expected, actual);
}

// Runs the Predictor11 check directly, without going through FeatureTestRunner.
[Fact]
public void Predictor11_Works() => RunPredictor11Test();

[Fact]
public void SubtractGreen_Works() => RunSubtractGreenTest();

Expand All @@ -145,6 +169,12 @@ private static void RunTransformColorInverseTest()
public void TransformColorInverse_Works() => RunTransformColorInverseTest();

#if SUPPORTS_RUNTIME_INTRINSICS
// Runs the Predictor11 check through FeatureTestRunner with HwIntrinsics.AllowAll.
[Fact]
public void Predictor11_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.AllowAll);

// Runs the Predictor11 check through FeatureTestRunner with HwIntrinsics.DisableSSE2.
// NOTE(review): presumably this exercises the non-SSE2 fallback path of Select — confirm.
[Fact]
public void Predictor11_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.DisableSSE2);

[Fact]
public void SubtractGreen_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.AllowAll);

Expand Down