diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 1b9e1f3ca1..af5f136fa9 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -18,12 +18,24 @@ internal static class LossyUtils #if SUPPORTS_RUNTIME_INTRINSICS private static readonly Vector128 Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte(); + private static readonly Vector128 SignBit = Vector128.Create((byte)0x80); + + private static readonly Vector128 Three = Vector128.Create((byte)3).AsSByte(); + + private static readonly Vector128 FourShort = Vector128.Create((short)4); + + private static readonly Vector128 FourSByte = Vector128.Create((byte)4).AsSByte(); + + private static readonly Vector128 Nine = Vector128.Create((short)0x0900).AsSByte(); + + private static readonly Vector128 SixtyThree = Vector128.Create((short)63).AsSByte(); + + private static readonly Vector128 SixtyFour = Vector128.Create((byte)64).AsSByte(); + private static readonly Vector128 K1 = Vector128.Create((short)20091); private static readonly Vector128 K2 = Vector128.Create((short)-30068); - private static readonly Vector128 Four = Vector128.Create((short)4); - #endif // Note: method name in libwebp reference implementation is called VP8SSE16x16. @@ -908,7 +920,7 @@ public static void TransformTwo(Span src, Span dst, Span scrat // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 dc = Sse2.Add(t0.AsInt16(), Four); + Vector128 dc = Sse2.Add(t0.AsInt16(), FourShort); a = Sse2.Add(dc, t2.AsInt16()); b = Sse2.Subtract(dc, t2.AsInt16()); @@ -1029,7 +1041,7 @@ public static void TransformOne(Span src, Span dst, Span scrat // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 dc = Sse2.Add(t0.AsInt16(), Four); + Vector128 dc = Sse2.Add(t0.AsInt16(), FourShort); a = Sse2.Add(dc, t2.AsInt16()); b = Sse2.Subtract(dc, t2.AsInt16()); @@ -1207,71 +1219,292 @@ public static void TransformDcuv(Span src, Span dst) // Simple In-loop filtering (Paragraph 15.2) public static void SimpleVFilter16(Span p, int offset, int stride, int thresh) { - int thresh2 = (2 * thresh) + 1; - int end = 16 + offset; - for (int i = offset; i < end; i++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - if (NeedsFilter(p, i, stride, thresh2)) + // Load. + ref byte pRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), offset); + + Vector128 p1 = Unsafe.As>(ref Unsafe.Subtract(ref pRef, 2 * stride)); + Vector128 p0 = Unsafe.As>(ref Unsafe.Subtract(ref pRef, stride)); + Vector128 q0 = Unsafe.As>(ref pRef); + Vector128 q1 = Unsafe.As>(ref Unsafe.Add(ref pRef, stride)); + + DoFilter2Sse2(ref p1, ref p0, ref q0, ref q1, thresh); + + // Store. + ref byte outputRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), offset); + Unsafe.As>(ref Unsafe.Subtract(ref outputRef, stride)) = p0.AsSByte(); + Unsafe.As>(ref outputRef) = q0.AsSByte(); + } + else +#endif + { + int thresh2 = (2 * thresh) + 1; + int end = 16 + offset; + for (int i = offset; i < end; i++) { - DoFilter2(p, i, stride); + if (NeedsFilter(p, i, stride, thresh2)) + { + DoFilter2(p, i, stride); + } } } } public static void SimpleHFilter16(Span p, int offset, int stride, int thresh) { - int thresh2 = (2 * thresh) + 1; - int end = offset + (16 * stride); - for (int i = offset; i < end; i += stride) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - if (NeedsFilter(p, i, 1, thresh2)) + // Beginning of p1 + ref byte pRef = ref Unsafe.Add(ref MemoryMarshal.GetReference(p), offset - 2); + + Load16x4(ref pRef, ref Unsafe.Add(ref pRef, 8 * stride), stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1); + DoFilter2Sse2(ref p1, ref p0, ref q0, ref q1, thresh); + Store16x4(p1, p0, q0, q1, ref pRef, ref Unsafe.Add(ref pRef, 8 * stride), stride); + } + else +#endif + { + int thresh2 = (2 * thresh) + 1; + int end = offset + (16 * stride); + for (int i = offset; i < end; i += stride) { - DoFilter2(p, i, 1); + if (NeedsFilter(p, i, 1, thresh2)) + { + DoFilter2(p, i, 1); + } } } } public static void SimpleVFilter16i(Span p, int offset, int stride, int thresh) { - for (int k = 3; k > 0; --k) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - offset += 4 * stride; - SimpleVFilter16(p, offset, stride, thresh); + for (int k = 3; k > 0; k--) + { + offset += 4 * stride; + SimpleVFilter16(p, offset, stride, thresh); + } + } + else +#endif + { + for (int k = 3; k > 0; k--) + { + offset += 4 * stride; + SimpleVFilter16(p, offset, stride, thresh); + } } } public static void SimpleHFilter16i(Span p, int offset, int stride, int thresh) { - for (int k = 3; k > 0; --k) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - offset += 4; - SimpleHFilter16(p, offset, stride, thresh); + for (int k = 3; k > 0; k--) + { + offset += 4; + SimpleHFilter16(p, offset, stride, thresh); + } + } + else +#endif + { + for (int k = 3; k > 0; k--) + { + offset += 4; + SimpleHFilter16(p, offset, stride, thresh); + } } } + // On macroblock edges. [MethodImpl(InliningOptions.ShortMethod)] public static void VFilter16(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) - => FilterLoop26(p, offset, stride, 1, 16, thresh, ithresh, hevThresh); + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + ref byte pRef = ref MemoryMarshal.GetReference(p); + Vector128 t1 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset - (4 * stride))); + Vector128 p2 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset - (3 * stride))); + Vector128 p1 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset - (2 * stride))); + Vector128 p0 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset - stride)); + + Vector128 mask = Abs(p1, p0); + mask = Sse2.Max(mask, Abs(t1, p2)); + mask = Sse2.Max(mask, Abs(p2, p1)); + + Vector128 q0 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset)); + Vector128 q1 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset + stride)); + Vector128 q2 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset + (2 * stride))); + t1 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset + (3 * stride))); + + mask = Sse2.Max(mask, Abs(q1, q0)); + mask = Sse2.Max(mask, Abs(t1, q2)); + mask = Sse2.Max(mask, Abs(q2, q1)); + + ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); + + // Store. + ref byte outputRef = ref MemoryMarshal.GetReference(p); + Unsafe.As>(ref Unsafe.Add(ref outputRef, offset - (3 * stride))) = p2.AsInt32(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, offset - (2 * stride))) = p1.AsInt32(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, offset - stride)) = p0.AsInt32(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, offset)) = q0.AsInt32(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, offset + stride)) = q1.AsInt32(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, offset + (2 * stride))) = q2.AsInt32(); + } + else +#endif + { + FilterLoop26(p, offset, stride, 1, 16, thresh, ithresh, hevThresh); + } + } [MethodImpl(InliningOptions.ShortMethod)] public static void HFilter16(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) - => FilterLoop26(p, offset, 1, stride, 16, thresh, ithresh, hevThresh); + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + ref byte pRef = ref MemoryMarshal.GetReference(p); + ref byte bRef = ref Unsafe.Add(ref pRef, offset - 4); + Load16x4(ref bRef, ref Unsafe.Add(ref bRef, 8 * stride), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); + + Vector128 mask = Abs(p1, p0); + mask = Sse2.Max(mask, Abs(p3, p2)); + mask = Sse2.Max(mask, Abs(p2, p1)); + + Load16x4(ref Unsafe.Add(ref pRef, offset), ref Unsafe.Add(ref pRef, offset + (8 * stride)), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); + + mask = Sse2.Max(mask, Abs(q1, q0)); + mask = Sse2.Max(mask, Abs(q3, q2)); + mask = Sse2.Max(mask, Abs(q2, q1)); + + ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); + + Store16x4(p3, p2, p1, p0, ref bRef, ref Unsafe.Add(ref bRef, 8 * stride), stride); + Store16x4(q0, q1, q2, q3, ref Unsafe.Add(ref pRef, offset), ref Unsafe.Add(ref pRef, offset + (8 * stride)), stride); + } + else +#endif + { + FilterLoop26(p, offset, 1, stride, 16, thresh, ithresh, hevThresh); + } + } public static void VFilter16i(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) { - for (int k = 3; k > 0; --k) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - offset += 4 * stride; - FilterLoop24(p, offset, stride, 1, 16, thresh, ithresh, hevThresh); + ref byte pRef = ref MemoryMarshal.GetReference(p); + Vector128 p3 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset)); + Vector128 p2 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset + stride)); + Vector128 p1 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset + (2 * stride))); + Vector128 p0 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset + (3 * stride))); + + for (int k = 3; k > 0; k--) + { + // Beginning of p1. + Span b = p.Slice(offset + (2 * stride)); + offset += 4 * stride; + + Vector128 mask = Abs(p0, p1); + mask = Sse2.Max(mask, Abs(p3, p2)); + mask = Sse2.Max(mask, Abs(p2, p1)); + + p3 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset)); + p2 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset + stride)); + Vector128 tmp1 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset + (2 * stride))); + Vector128 tmp2 = Unsafe.As>(ref Unsafe.Add(ref pRef, offset + (3 * stride))); + + mask = Sse2.Max(mask, Abs(tmp1, tmp2)); + mask = Sse2.Max(mask, Abs(p3, p2)); + mask = Sse2.Max(mask, Abs(p2, tmp1)); + + // p3 and p2 are not just temporary variables here: they will be + // re-used for next span. And q2/q3 will become p1/p0 accordingly. + ComplexMask(p1, p0, p3, p2, thresh, ithresh, ref mask); + DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); + + // Store. + ref byte outputRef = ref MemoryMarshal.GetReference(b); + Unsafe.As>(ref outputRef) = p1.AsInt32(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, stride)) = p0.AsInt32(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, stride * 2)) = p3.AsInt32(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, stride * 3)) = p2.AsInt32(); + + // Rotate samples. + p1 = tmp1; + p0 = tmp2; + } + } + else +#endif + { + for (int k = 3; k > 0; k--) + { + offset += 4 * stride; + FilterLoop24(p, offset, stride, 1, 16, thresh, ithresh, hevThresh); + } } } public static void HFilter16i(Span p, int offset, int stride, int thresh, int ithresh, int hevThresh) { - for (int k = 3; k > 0; --k) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - offset += 4; - FilterLoop24(p, offset, 1, stride, 16, thresh, ithresh, hevThresh); + ref byte pRef = ref MemoryMarshal.GetReference(p); + Load16x4(ref Unsafe.Add(ref pRef, offset), ref Unsafe.Add(ref pRef, offset + (8 * stride)), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); + + Vector128 mask; + for (int k = 3; k > 0; k--) + { + // Beginning of p1. + ref byte bRef = ref Unsafe.Add(ref pRef, offset + 2); + + // Beginning of q0 (and next span). + offset += 4; + + // Compute partial mask. + mask = Abs(p1, p0); + mask = Sse2.Max(mask, Abs(p3, p2)); + mask = Sse2.Max(mask, Abs(p2, p1)); + + Load16x4(ref Unsafe.Add(ref pRef, offset), ref Unsafe.Add(ref pRef, offset + (8 * stride)), stride, out p3, out p2, out Vector128 tmp1, out Vector128 tmp2); + + mask = Sse2.Max(mask, Abs(tmp1, tmp2)); + mask = Sse2.Max(mask, Abs(p3, p2)); + mask = Sse2.Max(mask, Abs(p2, tmp1)); + + ComplexMask(p1, p0, p3, p2, thresh, ithresh, ref mask); + DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); + + Store16x4(p1, p0, p3, p2, ref bRef, ref Unsafe.Add(ref bRef, 8 * stride), stride); + + // Rotate samples. + p1 = tmp1; + p0 = tmp2; + } + } + else +#endif + { + for (int k = 3; k > 0; k--) + { + offset += 4; + FilterLoop24(p, offset, 1, stride, 16, thresh, ithresh, hevThresh); + } } } @@ -1279,31 +1512,167 @@ public static void HFilter16i(Span p, int offset, int stride, int thresh, [MethodImpl(InliningOptions.ShortMethod)] public static void VFilter8(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - FilterLoop26(u, offset, stride, 1, 8, thresh, ithresh, hevThresh); - FilterLoop26(v, offset, stride, 1, 8, thresh, ithresh, hevThresh); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + // Load uv h-edges. + ref byte uRef = ref MemoryMarshal.GetReference(u); + ref byte vRef = ref MemoryMarshal.GetReference(v); + Vector128 t1 = LoadUvEdge(ref uRef, ref vRef, offset - (4 * stride)); + Vector128 p2 = LoadUvEdge(ref uRef, ref vRef, offset - (3 * stride)); + Vector128 p1 = LoadUvEdge(ref uRef, ref vRef, offset - (2 * stride)); + Vector128 p0 = LoadUvEdge(ref uRef, ref vRef, offset - stride); + + Vector128 mask = Abs(p1, p0); + mask = Sse2.Max(mask, Abs(t1, p2)); + mask = Sse2.Max(mask, Abs(p2, p1)); + + Vector128 q0 = LoadUvEdge(ref uRef, ref vRef, offset); + Vector128 q1 = LoadUvEdge(ref uRef, ref vRef, offset + stride); + Vector128 q2 = LoadUvEdge(ref uRef, ref vRef, offset + (2 * stride)); + t1 = LoadUvEdge(ref uRef, ref vRef, offset + (3 * stride)); + + mask = Sse2.Max(mask, Abs(q1, q0)); + mask = Sse2.Max(mask, Abs(t1, q2)); + mask = Sse2.Max(mask, Abs(q2, q1)); + + ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); + + // Store. + StoreUv(p2, ref uRef, ref vRef, offset - (3 * stride)); + StoreUv(p1, ref uRef, ref vRef, offset - (2 * stride)); + StoreUv(p0, ref uRef, ref vRef, offset - stride); + StoreUv(q0, ref uRef, ref vRef, offset); + StoreUv(q1, ref uRef, ref vRef, offset + (1 * stride)); + StoreUv(q2, ref uRef, ref vRef, offset + (2 * stride)); + } + else +#endif + { + FilterLoop26(u, offset, stride, 1, 8, thresh, ithresh, hevThresh); + FilterLoop26(v, offset, stride, 1, 8, thresh, ithresh, hevThresh); + } } [MethodImpl(InliningOptions.ShortMethod)] public static void HFilter8(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - FilterLoop26(u, offset, 1, stride, 8, thresh, ithresh, hevThresh); - FilterLoop26(v, offset, 1, stride, 8, thresh, ithresh, hevThresh); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + ref byte uRef = ref MemoryMarshal.GetReference(u); + ref byte vRef = ref MemoryMarshal.GetReference(v); + Load16x4(ref Unsafe.Add(ref uRef, offset - 4), ref Unsafe.Add(ref vRef, offset - 4), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); + + Vector128 mask = Abs(p1, p0); + mask = Sse2.Max(mask, Abs(p3, p2)); + mask = Sse2.Max(mask, Abs(p2, p1)); + + Load16x4(ref Unsafe.Add(ref uRef, offset), ref Unsafe.Add(ref vRef, offset), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); + + mask = Sse2.Max(mask, Abs(q1, q0)); + mask = Sse2.Max(mask, Abs(q3, q2)); + mask = Sse2.Max(mask, Abs(q2, q1)); + + ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); + + Store16x4(p3, p2, p1, p0, ref Unsafe.Add(ref uRef, offset - 4), ref Unsafe.Add(ref vRef, offset - 4), stride); + Store16x4(q0, q1, q2, q3, ref Unsafe.Add(ref uRef, offset), ref Unsafe.Add(ref vRef, offset), stride); + } + else +#endif + { + FilterLoop26(u, offset, 1, stride, 8, thresh, ithresh, hevThresh); + FilterLoop26(v, offset, 1, stride, 8, thresh, ithresh, hevThresh); + } } [MethodImpl(InliningOptions.ShortMethod)] public static void VFilter8i(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - int offset4mulstride = offset + (4 * stride); - FilterLoop24(u, offset4mulstride, stride, 1, 8, thresh, ithresh, hevThresh); - FilterLoop24(v, offset4mulstride, stride, 1, 8, thresh, ithresh, hevThresh); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + // Load uv h-edges. + ref byte uRef = ref MemoryMarshal.GetReference(u); + ref byte vRef = ref MemoryMarshal.GetReference(v); + Vector128 t2 = LoadUvEdge(ref uRef, ref vRef, offset); + Vector128 t1 = LoadUvEdge(ref uRef, ref vRef, offset + stride); + Vector128 p1 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 2)); + Vector128 p0 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 3)); + + Vector128 mask = Abs(p1, p0); + mask = Sse2.Max(mask, Abs(t2, t1)); + mask = Sse2.Max(mask, Abs(t1, p1)); + + offset += 4 * stride; + + Vector128 q0 = LoadUvEdge(ref uRef, ref vRef, offset); + Vector128 q1 = LoadUvEdge(ref uRef, ref vRef, offset + stride); + t1 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 2)); + t2 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 3)); + + mask = Sse2.Max(mask, Abs(q1, q0)); + mask = Sse2.Max(mask, Abs(t2, t1)); + mask = Sse2.Max(mask, Abs(t1, q1)); + + ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + DoFilter4Sse2(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); + + // Store. + StoreUv(p1, ref uRef, ref vRef, offset + (-2 * stride)); + StoreUv(p0, ref uRef, ref vRef, offset + (-1 * stride)); + StoreUv(q0, ref uRef, ref vRef, offset); + StoreUv(q1, ref uRef, ref vRef, offset + stride); + } + else +#endif + { + int offset4mulstride = offset + (4 * stride); + FilterLoop24(u, offset4mulstride, stride, 1, 8, thresh, ithresh, hevThresh); + FilterLoop24(v, offset4mulstride, stride, 1, 8, thresh, ithresh, hevThresh); + } } [MethodImpl(InliningOptions.ShortMethod)] public static void HFilter8i(Span u, Span v, int offset, int stride, int thresh, int ithresh, int hevThresh) { - int offsetPlus4 = offset + 4; - FilterLoop24(u, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh); - FilterLoop24(v, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + ref byte uRef = ref MemoryMarshal.GetReference(u); + ref byte vRef = ref MemoryMarshal.GetReference(v); + Load16x4(ref Unsafe.Add(ref uRef, offset), ref Unsafe.Add(ref vRef, offset), stride, out Vector128 t2, out Vector128 t1, out Vector128 p1, out Vector128 p0); + + Vector128 mask = Abs(p1, p0); + mask = Sse2.Max(mask, Abs(t2, t1)); + mask = Sse2.Max(mask, Abs(t1, p1)); + + // Beginning of q0. + offset += 4; + + Load16x4(ref Unsafe.Add(ref uRef, offset), ref Unsafe.Add(ref vRef, offset), stride, out Vector128 q0, out Vector128 q1, out t1, out t2); + + mask = Sse2.Max(mask, Abs(q1, q0)); + mask = Sse2.Max(mask, Abs(t2, t1)); + mask = Sse2.Max(mask, Abs(t1, q1)); + + ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + DoFilter4Sse2(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); + + // Beginning of p1. + offset -= 2; + Store16x4(p1, p0, q0, q1, ref Unsafe.Add(ref uRef, offset), ref Unsafe.Add(ref vRef, offset), stride); + } + else +#endif + { + int offsetPlus4 = offset + 4; + FilterLoop24(u, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh); + FilterLoop24(v, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh); + } } public static void Mean16x4(Span input, Span dc) @@ -1460,6 +1829,7 @@ private static void FilterLoop26( } } + // Applies filter on 2 pixels (p0 and q0) private static void DoFilter2(Span p, int offset, int step) { // 4 pixels in, 2 pixels out. @@ -1474,6 +1844,143 @@ private static void DoFilter2(Span p, int offset, int step) p[offset] = WebpLookupTables.Clip1(q0 - a1); } +#if SUPPORTS_RUNTIME_INTRINSICS + // Applies filter on 2 pixels (p0 and q0) + private static void DoFilter2Sse2(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int thresh) + { + // Convert p1/q1 to byte (for GetBaseDelta). + Vector128 p1s = Sse2.Xor(p1, SignBit); + Vector128 q1s = Sse2.Xor(q1, SignBit); + Vector128 mask = NeedsFilter(p1, p0, q0, q1, thresh); + + // Flip sign. + p0 = Sse2.Xor(p0, SignBit); + q0 = Sse2.Xor(q0, SignBit); + + Vector128 a = GetBaseDelta(p1s.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1s.AsSByte()).AsByte(); + + // Mask filter values we don't care about. + a = Sse2.And(a, mask); + + DoSimpleFilterSse2(ref p0, ref q0, a); + + // Flip sign. + p0 = Sse2.Xor(p0, SignBit); + q0 = Sse2.Xor(q0, SignBit); + } + + // Applies filter on 4 pixels (p1, p0, q0 and q1) + private static void DoFilter4Sse2(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, Vector128 mask, int tresh) + { + // Compute hev mask. + Vector128 notHev = GetNotHev(ref p1, ref p0, ref q0, ref q1, tresh); + + // Convert to signed values. + p1 = Sse2.Xor(p1, SignBit); + p0 = Sse2.Xor(p0, SignBit); + q0 = Sse2.Xor(q0, SignBit); + q1 = Sse2.Xor(q1, SignBit); + + Vector128 t1 = Sse2.SubtractSaturate(p1.AsSByte(), q1.AsSByte()); // p1 - q1 + t1 = Sse2.AndNot(notHev, t1.AsByte()).AsSByte(); // hev(p1 - q1) + Vector128 t2 = Sse2.SubtractSaturate(q0.AsSByte(), p0.AsSByte()); // q0 - p0 + t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 1 * (q0 - p0) + t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 2 * (q0 - p0) + t1 = Sse2.AddSaturate(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0) + t1 = Sse2.And(t1.AsByte(), mask).AsSByte(); // mask filter values we don't care about. + + t2 = Sse2.AddSaturate(t1, Three); // 3 * (q0 - p0) + hev(p1 - q1) + 3 + Vector128 t3 = Sse2.AddSaturate(t1, FourSByte); // 3 * (q0 - p0) + hev(p1 - q1) + 4 + t2 = SignedShift8b(t2.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 + t3 = SignedShift8b(t3.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 + p0 = Sse2.AddSaturate(p0.AsSByte(), t2).AsByte(); // p0 += t2 + q0 = Sse2.SubtractSaturate(q0.AsSByte(), t3).AsByte(); // q0 -= t3 + p0 = Sse2.Xor(p0, SignBit); + q0 = Sse2.Xor(q0, SignBit); + + // This is equivalent to signed (a + 1) >> 1 calculation. + t2 = Sse2.Add(t3, SignBit.AsSByte()); + t3 = Sse2.Average(t2.AsByte(), Vector128.Zero).AsSByte(); + t3 = Sse2.Subtract(t3, SixtyFour); + + t3 = Sse2.And(notHev, t3.AsByte()).AsSByte(); // if !hev + q1 = Sse2.SubtractSaturate(q1.AsSByte(), t3).AsByte(); // q1 -= t3 + p1 = Sse2.AddSaturate(p1.AsSByte(), t3).AsByte(); // p1 += t3 + p1 = Sse2.Xor(p1.AsByte(), SignBit); + q1 = Sse2.Xor(q1.AsByte(), SignBit); + } + + // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) + private static void DoFilter6Sse2(ref Vector128 p2, ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, ref Vector128 q2, Vector128 mask, int tresh) + { + // Compute hev mask. + Vector128 notHev = GetNotHev(ref p1, ref p0, ref q0, ref q1, tresh); + + // Convert to signed values. + p1 = Sse2.Xor(p1, SignBit); + p0 = Sse2.Xor(p0, SignBit); + q0 = Sse2.Xor(q0, SignBit); + q1 = Sse2.Xor(q1, SignBit); + p2 = Sse2.Xor(p2, SignBit); + q2 = Sse2.Xor(q2, SignBit); + + Vector128 a = GetBaseDelta(p1.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1.AsSByte()); + + // Do simple filter on pixels with hev. + Vector128 m = Sse2.AndNot(notHev, mask); + Vector128 f = Sse2.And(a.AsByte(), m); + DoSimpleFilterSse2(ref p0, ref q0, f); + + // Do strong filter on pixels with not hev. + m = Sse2.And(notHev, mask); + f = Sse2.And(a.AsByte(), m); + Vector128 flow = Sse2.UnpackLow(Vector128.Zero, f); + Vector128 fhigh = Sse2.UnpackHigh(Vector128.Zero, f); + + Vector128 f9Low = Sse2.MultiplyHigh(flow.AsInt16(), Nine.AsInt16()); // Filter (lo) * 9 + Vector128 f9High = Sse2.MultiplyHigh(fhigh.AsInt16(), Nine.AsInt16()); // Filter (hi) * 9 + + Vector128 a2Low = Sse2.Add(f9Low, SixtyThree.AsInt16()); // Filter * 9 + 63 + Vector128 a2High = Sse2.Add(f9High, SixtyThree.AsInt16()); // Filter * 9 + 63 + + Vector128 a1Low = Sse2.Add(a2Low, f9Low); // Filter * 18 + 63 + Vector128 a1High = Sse2.Add(a2High, f9High); // // Filter * 18 + 63 + + Vector128 a0Low = Sse2.Add(a1Low, f9Low); // Filter * 27 + 63 + Vector128 a0High = Sse2.Add(a1High, f9High); // Filter * 27 + 63 + + Update2Pixels(ref p2, ref q2, a2Low, a2High); + Update2Pixels(ref p1, ref q1, a1Low, a1High); + Update2Pixels(ref p0, ref q0, a0Low, a0High); + } + + private static void DoSimpleFilterSse2(ref Vector128 p0, ref Vector128 q0, Vector128 fl) + { + Vector128 v3 = Sse2.AddSaturate(fl.AsSByte(), Three); + Vector128 v4 = Sse2.AddSaturate(fl.AsSByte(), FourSByte); + + v4 = SignedShift8b(v4.AsByte()).AsSByte(); // v4 >> 3 + v3 = SignedShift8b(v3.AsByte()).AsSByte(); // v3 >> 3 + q0 = Sse2.SubtractSaturate(q0.AsSByte(), v4).AsByte(); // q0 -= v4 + p0 = Sse2.AddSaturate(p0.AsSByte(), v3).AsByte(); // p0 += v3 + } + + private static Vector128 GetNotHev(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int hevThresh) + { + Vector128 t1 = Abs(p1, p0); + Vector128 t2 = Abs(q1, q0); + + var h = Vector128.Create((byte)hevThresh); + Vector128 tMax = Sse2.Max(t1, t2); + + Vector128 tMaxH = Sse2.SubtractSaturate(tMax, h); + + // not_hev <= t1 && not_hev <= t2 + return Sse2.CompareEqual(tMaxH, Vector128.Zero); + } +#endif + + // Applies filter on 4 pixels (p1, p0, q0 and q1) private static void DoFilter4(Span p, int offset, int step) { // 4 pixels in, 4 pixels out. @@ -1492,6 +1999,7 @@ private static void DoFilter4(Span p, int offset, int step) p[offset + step] = WebpLookupTables.Clip1(q1 - a3); } + // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) private static void DoFilter6(Span p, int offset, int step) { // 6 pixels in, 6 pixels out. @@ -1550,6 +2058,201 @@ private static bool NeedsFilter2(Span p, int offset, int step, int t, int WebpLookupTables.Abs0(q2 - q1) <= it && WebpLookupTables.Abs0(q1 - q0) <= it; } +#if SUPPORTS_RUNTIME_INTRINSICS + private static Vector128 NeedsFilter(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh) + { + var mthresh = Vector128.Create((byte)thresh); + Vector128 t1 = Abs(p1, q1); // abs(p1 - q1) + var fe = Vector128.Create((byte)0xFE); + Vector128 t2 = Sse2.And(t1, fe); // set lsb of each byte to zero. + Vector128 t3 = Sse2.ShiftRightLogical(t2.AsInt16(), 1); // abs(p1 - q1) / 2 + + Vector128 t4 = Abs(p0, q0); // abs(p0 - q0) + Vector128 t5 = Sse2.AddSaturate(t4, t4); // abs(p0 - q0) * 2 + Vector128 t6 = Sse2.AddSaturate(t5.AsByte(), t3.AsByte()); // abs(p0-q0)*2 + abs(p1-q1)/2 + + Vector128 t7 = Sse2.SubtractSaturate(t6, mthresh.AsByte()); // mask <= m_thresh + + return Sse2.CompareEqual(t7, Vector128.Zero); + } + + private static void Load16x4(ref byte r0, ref byte r8, int stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1) + { + // Assume the pixels around the edge (|) are numbered as follows + // 00 01 | 02 03 + // 10 11 | 12 13 + // ... | ... + // e0 e1 | e2 e3 + // f0 f1 | f2 f3 + // + // r0 is pointing to the 0th row (00) + // r8 is pointing to the 8th row (80) + + // Load + // p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + Load8x4(ref r0, stride, out Vector128 t1, out Vector128 t2); + Load8x4(ref r8, stride, out p0, out q1); + + // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + p1 = Sse2.UnpackLow(t1.AsInt64(), p0.AsInt64()).AsByte(); + p0 = Sse2.UnpackHigh(t1.AsInt64(), p0.AsInt64()).AsByte(); + q0 = Sse2.UnpackLow(t2.AsInt64(), q1.AsInt64()).AsByte(); + q1 = Sse2.UnpackHigh(t2.AsInt64(), q1.AsInt64()).AsByte(); + } + + // Reads 8 rows across a vertical edge. + private static void Load8x4(ref byte bRef, int stride, out Vector128 p, out Vector128 q) + { + // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00 + // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10 + uint a00 = Unsafe.As(ref Unsafe.Add(ref bRef, 6 * stride)); + uint a01 = Unsafe.As(ref Unsafe.Add(ref bRef, 2 * stride)); + uint a02 = Unsafe.As(ref Unsafe.Add(ref bRef, 4 * stride)); + uint a03 = Unsafe.As(ref Unsafe.Add(ref bRef, 0 * stride)); + Vector128 a0 = Vector128.Create(a03, a02, a01, a00).AsByte(); + uint a10 = Unsafe.As(ref Unsafe.Add(ref bRef, 7 * stride)); + uint a11 = Unsafe.As(ref Unsafe.Add(ref bRef, 3 * stride)); + uint a12 = Unsafe.As(ref Unsafe.Add(ref bRef, 5 * stride)); + uint a13 = Unsafe.As(ref Unsafe.Add(ref bRef, 1 * stride)); + Vector128 a1 = Vector128.Create(a13, a12, a11, a10).AsByte(); + + // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 + // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 + Vector128 b0 = Sse2.UnpackLow(a0.AsSByte(), a1.AsSByte()); + Vector128 b1 = Sse2.UnpackHigh(a0.AsSByte(), a1.AsSByte()); + + // C0 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + // C1 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 + Vector128 c0 = Sse2.UnpackLow(b0.AsInt16(), b1.AsInt16()); + Vector128 c1 = Sse2.UnpackHigh(b0.AsInt16(), b1.AsInt16()); + + // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + p = Sse2.UnpackLow(c0.AsInt32(), c1.AsInt32()).AsByte(); + q = Sse2.UnpackHigh(c0.AsInt32(), c1.AsInt32()).AsByte(); + } + + // Transpose back and store + private static void Store16x4(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, ref byte r0Ref, ref byte r8Ref, int stride) + { + // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + Vector128 p0s = Sse2.UnpackLow(p1, p0); + Vector128 p1s = Sse2.UnpackHigh(p1, p0); + + // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + Vector128 q0s = Sse2.UnpackLow(q0, q1); + Vector128 q1s = Sse2.UnpackHigh(q0, q1); + + // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + Vector128 t1 = p0s; + p0s = Sse2.UnpackLow(t1.AsInt16(), q0s.AsInt16()).AsByte(); + q0s = Sse2.UnpackHigh(t1.AsInt16(), q0s.AsInt16()).AsByte(); + + // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + t1 = p1s; + p1s = Sse2.UnpackLow(t1.AsInt16(), q1s.AsInt16()).AsByte(); + q1s = Sse2.UnpackHigh(t1.AsInt16(), q1s.AsInt16()).AsByte(); + + Store4x4(p0s, ref r0Ref, stride); + Store4x4(q0s, ref Unsafe.Add(ref r0Ref, 4 * stride), stride); + + Store4x4(p1s, ref r8Ref, stride); + Store4x4(q1s, ref Unsafe.Add(ref r8Ref, 4 * stride), stride); + } + + private static void Store4x4(Vector128 x, ref byte dstRef, int stride) + { + int offset = 0; + for (int i = 0; i < 4; i++) + { + Unsafe.As(ref Unsafe.Add(ref dstRef, offset)) = Sse2.ConvertToInt32(x.AsInt32()); + x = Sse2.ShiftRightLogical128BitLane(x, 4); + offset += stride; + } + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 GetBaseDelta(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1) + { + // Beware of addition order, for saturation! + Vector128 p1q1 = Sse2.SubtractSaturate(p1, q1); // p1 - q1 + Vector128 q0p0 = Sse2.SubtractSaturate(q0, p0); // q0 - p0 + Vector128 s1 = Sse2.AddSaturate(p1q1, q0p0); // p1 - q1 + 1 * (q0 - p0) + Vector128 s2 = Sse2.AddSaturate(q0p0, s1); // p1 - q1 + 2 * (q0 - p0) + Vector128 s3 = Sse2.AddSaturate(q0p0, s2); // p1 - q1 + 3 * (q0 - p0) + + return s3; + } + + // Shift each byte of "x" by 3 bits while preserving by the sign bit. + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 SignedShift8b(Vector128 x) + { + Vector128 low0 = Sse2.UnpackLow(Vector128.Zero, x); + Vector128 high0 = Sse2.UnpackHigh(Vector128.Zero, x); + Vector128 low1 = Sse2.ShiftRightArithmetic(low0.AsInt16(), 3 + 8); + Vector128 high1 = Sse2.ShiftRightArithmetic(high0.AsInt16(), 3 + 8); + + return Sse2.PackSignedSaturate(low1, high1); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static void ComplexMask(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh, int ithresh, ref Vector128 mask) + { + var it = Vector128.Create((byte)ithresh); + Vector128 diff = Sse2.SubtractSaturate(mask, it); + Vector128 threshMask = Sse2.CompareEqual(diff, Vector128.Zero); + Vector128 filterMask = NeedsFilter(p1, p0, q0, q1, thresh); + + mask = Sse2.And(threshMask, filterMask); + } + + // Updates values of 2 pixels at MB edge during complex filtering. + // Update operations: + // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)] + // Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip). + private static void Update2Pixels(ref Vector128 pi, ref Vector128 qi, Vector128 a0Low, Vector128 a0High) + { + Vector128 a1Low = Sse2.ShiftRightArithmetic(a0Low, 7); + Vector128 a1High = Sse2.ShiftRightArithmetic(a0High, 7); + Vector128 delta = Sse2.PackSignedSaturate(a1Low, a1High); + pi = Sse2.AddSaturate(pi.AsSByte(), delta).AsByte(); + qi = Sse2.SubtractSaturate(qi.AsSByte(), delta).AsByte(); + pi = Sse2.Xor(pi, SignBit.AsByte()); + qi = Sse2.Xor(qi, SignBit.AsByte()); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 LoadUvEdge(ref byte uRef, ref byte vRef, int offset) + { + var uVec = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref uRef, offset)), 0); + var vVec = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref vRef, offset)), 0); + return Sse2.UnpackLow(uVec, vVec).AsByte(); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static void StoreUv(Vector128 x, ref byte uRef, ref byte vRef, int offset) + { + Unsafe.As>(ref Unsafe.Add(ref uRef, offset)) = x.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref vRef, offset)) = x.GetUpper(); + } + + // Compute abs(p - q) = subs(p - q) OR subs(q - p) + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 Abs(Vector128 p, Vector128 q) + => Sse2.Or(Sse2.SubtractSaturate(q, p), Sse2.SubtractSaturate(p, q)); +#endif + [MethodImpl(InliningOptions.ShortMethod)] private static bool Hev(Span p, int offset, int step, int thresh) { @@ -1594,14 +2297,7 @@ private static void Put8x8uv(byte value, Span dst) } [MethodImpl(InliningOptions.ShortMethod)] - private static void Memset(Span dst, byte value, int startIdx, int count) - { - int end = startIdx + count; - for (int i = startIdx; i < end; i++) - { - dst[i] = value; - } - } + private static void Memset(Span dst, byte value, int startIdx, int count) => dst.Slice(startIdx, count).Fill(value); [MethodImpl(InliningOptions.ShortMethod)] private static int Clamp255(int x) => x < 0 ? 0 : x > 255 ? 255 : x; diff --git a/tests/ImageSharp.Tests/Formats/WebP/WebpDecoderTests.cs b/tests/ImageSharp.Tests/Formats/WebP/WebpDecoderTests.cs index 34fa72c63e..2cbeff2ceb 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/WebpDecoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/WebpDecoderTests.cs @@ -4,6 +4,7 @@ using System.IO; using SixLabors.ImageSharp.Formats.Webp; using SixLabors.ImageSharp.PixelFormats; +using SixLabors.ImageSharp.Tests.TestUtilities; using SixLabors.ImageSharp.Tests.TestUtilities.ReferenceCodecs; using Xunit; using static SixLabors.ImageSharp.Tests.TestImages.Webp; @@ -19,6 +20,10 @@ public class WebpDecoderTests private static MagickReferenceDecoder ReferenceDecoder => new(); + private static string TestImageLossySimpleFilterPath => Path.Combine(TestEnvironment.InputImagesDirectoryFullPath, Lossy.SimpleFilter02); + + private static string TestImageLossyComplexFilterPath => Path.Combine(TestEnvironment.InputImagesDirectoryFullPath, Lossy.BikeComplexFilter); + [Theory] [InlineData(Lossless.GreenTransform1, 1000, 307, 32)] [InlineData(Lossless.BikeThreeTransforms, 250, 195, 32)] @@ -359,5 +364,33 @@ public void WebpDecoder_ThrowImageFormatException_OnInvalidImages(TestIm { } }); + +#if SUPPORTS_RUNTIME_INTRINSICS + private static void RunDecodeLossyWithSimpleFilterTest() + { + var provider = TestImageProvider.File(TestImageLossySimpleFilterPath); + using (Image image = provider.GetImage(WebpDecoder)) + { + image.DebugSave(provider); + image.CompareToOriginal(provider, ReferenceDecoder); + } + } + + private static void RunDecodeLossyWithComplexFilterTest() + { + var provider = TestImageProvider.File(TestImageLossyComplexFilterPath); + using (Image image = provider.GetImage(WebpDecoder)) + { + image.DebugSave(provider); + image.CompareToOriginal(provider, ReferenceDecoder); + } + } + + [Fact] + public void DecodeLossyWithSimpleFilterTest_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunDecodeLossyWithSimpleFilterTest, HwIntrinsics.DisableHWIntrinsic); + + [Fact] + public void DecodeLossyWithComplexFilterTest_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunDecodeLossyWithComplexFilterTest, HwIntrinsics.DisableHWIntrinsic); +#endif } }