diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 0d33e91..3a29952 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -6,7 +6,7 @@ env:
   CARGO_TERM_COLOR: always
 
 jobs:
-  build:
+  x86:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
@@ -77,3 +77,28 @@ jobs:
       env:
         RUSTFLAGS: -Ctarget-feature=+simd128,+bulk-memory,+nontrapping-fptoint,+sign-ext
       run: cargo test --target wasm32-wasi
+
+  aarch64:
+    runs-on: ubuntu-20.04
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+
+    - name: Install toolchain
+      uses: actions-rs/toolchain@v1
+      with:
+        toolchain: stable
+        override: true
+        target: aarch64-unknown-linux-gnu
+
+    - name: Install cross
+      run: cargo install cross
+
+    - name: Build with minimal features (no_std)
+      run: cross build --target aarch64-unknown-linux-gnu --verbose --no-default-features --features no-std-float
+
+    - name: Run tests without SIMD
+      run: cross test --target aarch64-unknown-linux-gnu --verbose --no-default-features --features png-format
+
+    - name: Run tests with Neon
+      run: cross test --target aarch64-unknown-linux-gnu
diff --git a/Cargo.toml b/Cargo.toml
index b4a9c8b..6669b3e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,8 +30,9 @@ default = ["std", "simd", "png-format"]
 std = ["tiny-skia-path/std"]
 no-std-float = ["tiny-skia-path/no-std-float"]
 
-# Enables x86 SIMD instructions from SSE up to AVX2.
-# Has no effect on non-x86 targets. Present mainly for testing.
+# Enables SIMD instructions on x86 (from SSE up to AVX2), WebAssembly (SIMD128)
+# and AArch64 (Neon).
+# Has no effect on other targets. Present mainly for testing.
 simd = []
 
 # Allows loading and saving `Pixmap` as PNG.
diff --git a/path/src/f32x2_t.rs b/path/src/f32x2_t.rs
index ae700b7..e9f9d09 100644
--- a/path/src/f32x2_t.rs
+++ b/path/src/f32x2_t.rs
@@ -36,22 +36,22 @@ impl f32x2 {
     /// Returns a minimum value.
     pub fn min(self, other: f32x2) -> f32x2 {
         f32x2([
-            self.x().min(other.x()),
-            self.y().min(other.y()),
+            pmin(self.x(), other.x()),
+            pmin(self.y(), other.y()),
         ])
     }
 
     /// Returns a maximum value.
     pub fn max(self, other: f32x2) -> f32x2 {
         f32x2([
-            self.x().max(other.x()),
-            self.y().max(other.y()),
+            pmax(self.x(), other.x()),
+            pmax(self.y(), other.y()),
         ])
     }
 
     /// Returns a maximum of both values.
     pub fn max_component(self) -> f32 {
-        self.x().max(self.y())
+        pmax(self.x(), self.y())
     }
 
     /// Returns the first value.
@@ -104,3 +104,11 @@ impl core::ops::Div for f32x2 {
         ])
     }
 }
+
+fn pmax(a: f32, b: f32) -> f32 {
+    if a < b { b } else { a }
+}
+
+fn pmin(a: f32, b: f32) -> f32 {
+    if b < a { b } else { a }
+}
diff --git a/src/pipeline/highp.rs b/src/pipeline/highp.rs
index bdbde35..5a27ca7 100644
--- a/src/pipeline/highp.rs
+++ b/src/pipeline/highp.rs
@@ -416,7 +416,7 @@ blend_fn2!(color_burn, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
         d + s * inv(da),
         s.cmp_eq(f32x8::default()).blend(
             d * inv(sa),
-            sa * (da - da.min((da - d) * sa * s.recip())) + s * inv(da) + d * inv(sa)
+            sa * (da - da.min((da - d) * sa * s.recip_fast())) + s * inv(da) + d * inv(sa)
         )
     )
 );
@@ -426,7 +426,7 @@ blend_fn2!(color_dodge, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
         s * inv(da),
         s.cmp_eq(sa).blend(
             s + d * inv(sa),
-            sa * da.min((d * sa) * (sa - s).recip()) + s * inv(da) + d * inv(sa)
+            sa * da.min((d * sa) * (sa - s).recip_fast()) + s * inv(da) + d * inv(sa)
         )
     )
 );
@@ -456,7 +456,7 @@ blend_fn2!(soft_light, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| {
     // 3. light src, light dst?
     let dark_src = d * (sa + (s2 - sa) * (f32x8::splat(1.0) - m));
     let dark_dst = (m4 * m4 + m4) * (m - f32x8::splat(1.0)) + f32x8::splat(7.0) * m;
-    let lite_dst = m.recip_sqrt().recip() - m;
+    let lite_dst = m.sqrt() - m;
     let lite_src = d * sa + da * (s2 - sa) * two(two(d)).cmp_le(da).blend(dark_dst, lite_dst);
 
     // 2 or 3?
diff --git a/src/wide/f32x16_t.rs b/src/wide/f32x16_t.rs
index 17d1a46..3ddf2df 100644
--- a/src/wide/f32x16_t.rs
+++ b/src/wide/f32x16_t.rs
@@ -74,7 +74,7 @@ impl f32x16 {
 
     pub fn floor(&self) -> Self {
         // Yes, Skia does it in the same way.
-        let roundtrip = self.round_int();
+        let roundtrip = self.round();
         roundtrip - roundtrip.cmp_gt(self).blend(f32x16::splat(1.0), f32x16::splat(0.0))
     }
 
@@ -85,10 +85,10 @@ impl f32x16 {
         ])
     }
 
-    pub fn round_int(&self) -> Self {
+    pub fn round(&self) -> Self {
         Self([
-            self.0[0].round_int().to_f32x8(),
-            self.0[1].round_int().to_f32x8(),
+            self.0[0].round(),
+            self.0[1].round(),
         ])
     }
 
diff --git a/src/wide/f32x4_t.rs b/src/wide/f32x4_t.rs
index 6190f15..4f9db39 100644
--- a/src/wide/f32x4_t.rs
+++ b/src/wide/f32x4_t.rs
@@ -24,6 +24,12 @@ cfg_if::cfg_if! {
         #[derive(Clone, Copy, Debug)]
         #[repr(transparent)]
         pub struct f32x4(v128);
+    } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+        use core::arch::aarch64::*;
+
+        #[derive(Clone, Copy, Debug)]
+        #[repr(C, align(16))]
+        pub struct f32x4(float32x4_t);
     } else {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(16))]
@@ -40,34 +46,46 @@ impl f32x4 {
     }
 
     pub fn max(self, rhs: Self) -> Self {
+        // These technically don't have the same semantics for NaN and 0, but it
+        // doesn't seem to matter as Skia does it the same way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                 Self(unsafe { _mm_max_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(f32x4_max(self.0, rhs.0))
+                Self(f32x4_pmax(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmaxq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
-                    self.0[0].max(rhs.0[0]),
-                    self.0[1].max(rhs.0[1]),
-                    self.0[2].max(rhs.0[2]),
-                    self.0[3].max(rhs.0[3]),
+                    super::pmax(self.0[0], rhs.0[0]),
+                    super::pmax(self.0[1], rhs.0[1]),
+                    super::pmax(self.0[2], rhs.0[2]),
+                    super::pmax(self.0[3], rhs.0[3]),
                 ])
             }
         }
     }
 
     pub fn min(self, rhs: Self) -> Self {
+        // These technically don't have the same semantics for NaN and 0, but it
+        // doesn't seem to matter as Skia does it the same way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                 Self(unsafe { _mm_min_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(f32x4_min(self.0, rhs.0))
+                Self(f32x4_pmin(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vminq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
-                    self.0[0].min(rhs.0[0]),
-                    self.0[1].min(rhs.0[1]),
-                    self.0[2].min(rhs.0[2]),
-                    self.0[3].min(rhs.0[3]),
+                    super::pmin(self.0[0], rhs.0[0]),
+                    super::pmin(self.0[1], rhs.0[1]),
+                    super::pmin(self.0[2], rhs.0[2]),
+                    super::pmin(self.0[3], rhs.0[3]),
                 ])
             }
         }
@@ -95,6 +113,10 @@ impl core::ops::Add for f32x4 {
                 Self(unsafe { _mm_add_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_add(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vaddq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
                     self.0[0] + rhs.0[0],
@@ -122,6 +144,10 @@ impl core::ops::Sub for f32x4 {
                 Self(unsafe { _mm_sub_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_sub(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vsubq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
                     self.0[0] - rhs.0[0],
@@ -143,6 +169,10 @@ impl core::ops::Mul for f32x4 {
                 Self(unsafe { _mm_mul_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_mul(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmulq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
                     self.0[0] * rhs.0[0],
diff --git a/src/wide/f32x8_t.rs b/src/wide/f32x8_t.rs
index e9398dc..daa88dc 100644
--- a/src/wide/f32x8_t.rs
+++ b/src/wide/f32x8_t.rs
@@ -32,6 +32,12 @@ cfg_if::cfg_if! {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(32))]
         pub struct f32x8(v128, v128);
+    } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+        use core::arch::aarch64::*;
+
+        #[derive(Clone, Copy, Debug)]
+        #[repr(C, align(32))]
+        pub struct f32x8(float32x4_t, float32x4_t);
     } else {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(32))]
@@ -57,6 +63,10 @@ impl f32x8 {
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_floor(self.0), f32x4_floor(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vrndmq_f32(self.0), vrndmq_f32(self.1))
+                }
             } else {
                 let roundtrip: f32x8 = cast(self.trunc_int().to_f32x8());
                 roundtrip - roundtrip.cmp_gt(self).blend(f32x8::splat(1.0), f32x8::default())
@@ -91,6 +101,13 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_eq(self.0, rhs.0), f32x4_eq(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vceqq_f32(self.0, rhs.0)),
+                        core::mem::transmute(vceqq_f32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, eq, rhs, f32::from_bits(u32::MAX), 0.0))
             }
@@ -108,6 +125,13 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_ge(self.0, rhs.0), f32x4_ge(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vcgeq_f32(self.0, rhs.0)),
+                        core::mem::transmute(vcgeq_f32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, ge, rhs, f32::from_bits(u32::MAX), 0.0))
             }
@@ -125,6 +149,13 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_gt(self.0, rhs.0), f32x4_gt(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vcgtq_f32(self.0, rhs.0)),
+                        core::mem::transmute(vcgtq_f32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, gt, rhs, f32::from_bits(u32::MAX), 0.0))
             }
@@ -142,6 +173,13 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_ne(self.0, rhs.0), f32x4_ne(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vmvnq_u32(vceqq_f32(self.0, rhs.0))),
+                        core::mem::transmute(vmvnq_u32(vceqq_f32(self.1, rhs.1))),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, ne, rhs, f32::from_bits(u32::MAX), 0.0))
             }
@@ -159,6 +197,13 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_le(self.0, rhs.0), f32x4_le(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vcleq_f32(self.0, rhs.0)),
+                        core::mem::transmute(vcleq_f32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, le, rhs, f32::from_bits(u32::MAX), 0.0))
             }
@@ -176,6 +221,13 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_lt(self.0, rhs.0), f32x4_lt(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vcltq_f32(self.0, rhs.0)),
+                        core::mem::transmute(vcltq_f32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, lt, rhs, f32::from_bits(u32::MAX), 0.0))
             }
@@ -194,6 +246,21 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_bitselect(t.0, f.0, self.0), v128_bitselect(t.1, f.1, self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vbslq_u32(
+                            core::mem::transmute(self.0),
+                            core::mem::transmute(t.0),
+                            core::mem::transmute(f.0),
+                        )),
+                        core::mem::transmute(vbslq_u32(
+                            core::mem::transmute(self.1),
+                            core::mem::transmute(t.1),
+                            core::mem::transmute(f.1),
+                        )),
+                    )
+                }
             } else {
                 super::generic_bit_blend(self, t, f)
             }
@@ -204,6 +271,10 @@ impl f32x8 {
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_abs(self.0), f32x4_abs(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vabsq_f32(self.0), vabsq_f32(self.1))
+                }
             } else {
                 let non_sign_bits = f32x8::splat(f32::from_bits(i32::MAX as u32));
                 self & non_sign_bits
@@ -212,6 +283,8 @@ impl f32x8 {
     }
 
     pub fn max(self, rhs: Self) -> Self {
+        // These technically don't have the same semantics for NaN and 0, but it
+        // doesn't seem to matter as Skia does it the same way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                 Self(unsafe { _mm256_max_ps(self.0, rhs.0) })
@@ -221,14 +294,29 @@ impl f32x8 {
                     unsafe { _mm_max_ps(self.1, rhs.1) },
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(f32x4_max(self.0, rhs.0), f32x4_max(self.1, rhs.1))
+                Self(f32x4_pmax(self.0, rhs.0), f32x4_pmax(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmaxq_f32(self.0, rhs.0), vmaxq_f32(self.1, rhs.1))
+                }
             } else {
-                Self(impl_x8_op!(self, max, rhs))
+                Self([
+                    super::pmax(self.0[0], rhs.0[0]),
+                    super::pmax(self.0[1], rhs.0[1]),
+                    super::pmax(self.0[2], rhs.0[2]),
+                    super::pmax(self.0[3], rhs.0[3]),
+                    super::pmax(self.0[4], rhs.0[4]),
+                    super::pmax(self.0[5], rhs.0[5]),
+                    super::pmax(self.0[6], rhs.0[6]),
+                    super::pmax(self.0[7], rhs.0[7]),
+                ])
             }
         }
     }
 
     pub fn min(self, rhs: Self) -> Self {
+        // These technically don't have the same semantics for NaN and 0, but it
+        // doesn't seem to matter as Skia does it the same way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                 Self(unsafe { _mm256_min_ps(self.0, rhs.0) })
@@ -238,9 +326,22 @@ impl f32x8 {
                     unsafe { _mm_min_ps(self.1, rhs.1) },
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(f32x4_min(self.0, rhs.0), f32x4_min(self.1, rhs.1))
+                Self(f32x4_pmin(self.0, rhs.0), f32x4_pmin(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vminq_f32(self.0, rhs.0), vminq_f32(self.1, rhs.1))
+                }
             } else {
-                Self(impl_x8_op!(self, min, rhs))
+                Self([
+                    super::pmin(self.0[0], rhs.0[0]),
+                    super::pmin(self.0[1], rhs.0[1]),
+                    super::pmin(self.0[2], rhs.0[2]),
+                    super::pmin(self.0[3], rhs.0[3]),
+                    super::pmin(self.0[4], rhs.0[4]),
+                    super::pmin(self.0[5], rhs.0[5]),
+                    super::pmin(self.0[6], rhs.0[6]),
+                    super::pmin(self.0[7], rhs.0[7]),
+                ])
             }
         }
     }
@@ -248,7 +349,7 @@ impl f32x8 {
     pub fn is_finite(self) -> Self {
         let shifted_exp_mask = u32x8::splat(0xFF000000);
         let u: u32x8 = cast(self);
-        let shift_u = u << 1;
+        let shift_u = u.shl::<1>();
         let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
         cast(out)
     }
@@ -264,10 +365,14 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_nearest(self.0), f32x4_nearest(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vrndnq_f32(self.0), vrndnq_f32(self.1))
+                }
             } else {
                 let to_int = f32x8::splat(1.0 / f32::EPSILON);
                 let u: u32x8 = cast(self);
-                let e: i32x8 = cast((u >> 23) & u32x8::splat(0xff));
+                let e: i32x8 = cast(u.shr::<23>() & u32x8::splat(0xff));
                 let mut y: f32x8;
 
                 let no_op_magic = i32x8::splat(0x7f + 23);
@@ -293,6 +398,9 @@ impl f32x8 {
     }
 
     pub fn round_int(self) -> i32x8 {
+        // These technically don't have the same semantics for NaN and out of
+        // range values, but it doesn't seem to matter as Skia does it the same
+        // way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                 cast(unsafe { _mm256_cvtps_epi32(self.0) })
@@ -304,9 +412,13 @@ impl f32x8 {
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 let rounded = self.round();
                 i32x8(i32x4_trunc_sat_f32x4(rounded.0), i32x4_trunc_sat_f32x4(rounded.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    i32x8(vcvtnq_s32_f32(self.0), vcvtnq_s32_f32(self.1))
+                }
             } else {
                 let rounded: [f32; 8] = cast(self.round());
-                let rounded_ints: i32x8 = cast([
+                cast([
                     rounded[0] as i32,
                     rounded[1] as i32,
                     rounded[2] as i32,
@@ -315,16 +427,15 @@ impl f32x8 {
                     rounded[5] as i32,
                     rounded[6] as i32,
                     rounded[7] as i32,
-                ]);
-                cast::<i32x8>(self.is_finite()).blend(
-                    rounded_ints,
-                    i32x8::splat(i32::MIN)
-                )
+                ])
             }
         }
     }
 
     pub fn trunc_int(self) -> i32x8 {
+        // These technically don't have the same semantics for NaN and out of
+        // range values, but it doesn't seem to matter as Skia does it the same
+        // way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                 cast(unsafe { _mm256_cvttps_epi32(self.0) })
@@ -334,28 +445,28 @@ impl f32x8 {
                     unsafe { _mm_cvttps_epi32(self.1) },
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                cast(Self(
-                    i32x4_trunc_sat_f32x4(self.0),
-                    i32x4_trunc_sat_f32x4(self.1),
-                ))
+                i32x8(i32x4_trunc_sat_f32x4(self.0), i32x4_trunc_sat_f32x4(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    i32x8(vcvtq_s32_f32(self.0), vcvtq_s32_f32(self.1))
+                }
             } else {
                 let n: [f32; 8] = cast(self);
-                let ints: i32x8 = cast([
-                    n[0].trunc() as i32,
-                    n[1].trunc() as i32,
-                    n[2].trunc() as i32,
-                    n[3].trunc() as i32,
-                    n[4].trunc() as i32,
-                    n[5].trunc() as i32,
-                    n[6].trunc() as i32,
-                    n[7].trunc() as i32,
-                ]);
-                cast::<i32x8>(self.is_finite()).blend(ints, i32x8::splat(i32::MIN))
+                cast([
+                    n[0] as i32,
+                    n[1] as i32,
+                    n[2] as i32,
+                    n[3] as i32,
+                    n[4] as i32,
+                    n[5] as i32,
+                    n[6] as i32,
+                    n[7] as i32,
+                ])
             }
         }
     }
 
-    pub fn recip(self) -> Self {
+    pub fn recip_fast(self) -> Self {
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                 Self(unsafe { _mm256_rcp_ps(self.0) })
@@ -370,6 +481,16 @@ impl f32x8 {
                     f32x4_div(one, self.0),
                     f32x4_div(one, self.1),
                 )
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    let a = vrecpeq_f32(self.0);
+                    let a = vmulq_f32(vrecpsq_f32(self.0, a), a);
+
+                    let b = vrecpeq_f32(self.1);
+                    let b = vmulq_f32(vrecpsq_f32(self.1, b), b);
+
+                    Self(a, b)
+                }
             } else {
                 Self::from([
                     1.0 / self.0[0],
@@ -400,6 +521,16 @@ impl f32x8 {
                     f32x4_div(one, f32x4_sqrt(self.0)),
                     f32x4_div(one, f32x4_sqrt(self.1)),
                 )
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    let a = vrsqrteq_f32(self.0);
+                    let a = vmulq_f32(vrsqrtsq_f32(self.0, vmulq_f32(a, a)), a);
+
+                    let b = vrsqrteq_f32(self.1);
+                    let b = vmulq_f32(vrsqrtsq_f32(self.1, vmulq_f32(b, b)), b);
+
+                    Self(a, b)
+                }
             } else {
                 Self::from([
                     1.0 / self.0[0].sqrt(),
@@ -426,6 +557,10 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_sqrt(self.0), f32x4_sqrt(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vsqrtq_f32(self.0), vsqrtq_f32(self.1))
+                }
             } else {
                 Self::from([
                     self.0[0].sqrt(),
@@ -468,6 +603,10 @@ impl core::ops::Add for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_add(self.0, rhs.0), f32x4_add(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vaddq_f32(self.0, rhs.0), vaddq_f32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, add, rhs))
             }
@@ -495,6 +634,10 @@ impl core::ops::Sub for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_sub(self.0, rhs.0), f32x4_sub(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vsubq_f32(self.0, rhs.0), vsubq_f32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, sub, rhs))
             }
@@ -516,6 +659,10 @@ impl core::ops::Mul for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_mul(self.0, rhs.0), f32x4_mul(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmulq_f32(self.0, rhs.0), vmulq_f32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, mul, rhs))
             }
@@ -543,6 +690,10 @@ impl core::ops::Div for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_div(self.0, rhs.0), f32x4_div(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vdivq_f32(self.0, rhs.0), vdivq_f32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, div, rhs))
             }
@@ -565,6 +716,19 @@ impl core::ops::BitAnd for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_and(self.0, rhs.0), v128_and(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vandq_u32(
+                            core::mem::transmute(self.0),
+                            core::mem::transmute(rhs.0),
+                        )),
+                        core::mem::transmute(vandq_u32(
+                            core::mem::transmute(self.1),
+                            core::mem::transmute(rhs.1),
+                        )),
+                    )
+                }
             } else {
                 Self([
                     f32::from_bits(self.0[0].to_bits() & rhs.0[0].to_bits()),
@@ -596,6 +760,19 @@ impl core::ops::BitOr for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_or(self.0, rhs.0), v128_or(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vorrq_u32(
+                            core::mem::transmute(self.0),
+                            core::mem::transmute(rhs.0),
+                        )),
+                        core::mem::transmute(vorrq_u32(
+                            core::mem::transmute(self.1),
+                            core::mem::transmute(rhs.1),
+                        )),
+                    )
+                }
             } else {
                 Self([
                     f32::from_bits(self.0[0].to_bits() | rhs.0[0].to_bits()),
@@ -627,6 +804,19 @@ impl core::ops::BitXor for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_xor(self.0, rhs.0), v128_xor(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(veorq_u32(
+                            core::mem::transmute(self.0),
+                            core::mem::transmute(rhs.0),
+                        )),
+                        core::mem::transmute(veorq_u32(
+                            core::mem::transmute(self.1),
+                            core::mem::transmute(rhs.1),
+                        )),
+                    )
+                }
             } else {
                 Self([
                     f32::from_bits(self.0[0].to_bits() ^ rhs.0[0].to_bits()),
@@ -666,6 +856,13 @@ impl core::ops::Not for f32x8 {
                 }
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_not(self.0), v128_not(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vmvnq_u32(core::mem::transmute(self.0))),
+                        core::mem::transmute(vmvnq_u32(core::mem::transmute(self.1))),
+                    )
+                }
             } else {
                 self ^ Self::splat(cast(u32::MAX))
             }
@@ -681,6 +878,13 @@ impl core::cmp::PartialEq for f32x8 {
                 unsafe { _mm256_movemask_ps(mask) == 0b1111_1111 }
             } else if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                 unsafe { _mm_movemask_ps(_mm_cmpeq_ps(self.0, rhs.0)) == 0b1111 }
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    vminvq_u32(vandq_u32(
+                        vceqq_f32(self.0, rhs.0),
+                        vceqq_f32(self.1, rhs.1),
+                    )) != 0
+                }
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 u32x4_all_true(f32x4_eq(self.0, rhs.0)) &
                 u32x4_all_true(f32x4_eq(self.1, rhs.1))
diff --git a/src/wide/i32x8_t.rs b/src/wide/i32x8_t.rs
index ccee170..dd3c769 100644
--- a/src/wide/i32x8_t.rs
+++ b/src/wide/i32x8_t.rs
@@ -29,6 +29,12 @@ cfg_if::cfg_if! {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(32))]
         pub struct i32x8(pub v128, pub v128);
+    } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+        use core::arch::aarch64::*;
+
+        #[derive(Clone, Copy, Debug)]
+        #[repr(C, align(32))]
+        pub struct i32x8(pub int32x4_t, pub int32x4_t);
     } else {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(32))]
@@ -61,6 +67,13 @@ impl i32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_bitselect(t.0, f.0, self.0), v128_bitselect(t.1, f.1, self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        vbslq_s32(core::mem::transmute(self.0), t.0, f.0),
+                        vbslq_s32(core::mem::transmute(self.1), t.1, f.1),
+                    )
+                }
             } else {
                 super::generic_bit_blend(self, t, f)
             }
@@ -78,6 +91,13 @@ impl i32x8 {
                 ))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(i32x4_eq(self.0, rhs.0), i32x4_eq(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vceqq_s32(self.0, rhs.0)),
+                        core::mem::transmute(vceqq_s32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, eq, rhs, -1, 0))
             }
@@ -95,6 +115,13 @@ impl i32x8 {
                 ))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(i32x4_gt(self.0, rhs.0), i32x4_eq(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vcgtq_s32(self.0, rhs.0)),
+                        core::mem::transmute(vcgtq_s32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, gt, rhs, -1, 0))
             }
@@ -116,6 +143,13 @@ impl i32x8 {
                 ))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(i32x4_lt(self.0, rhs.0), i32x4_lt(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vcltq_s32(self.0, rhs.0)),
+                        core::mem::transmute(vcltq_s32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, lt, rhs, -1, 0))
             }
@@ -133,6 +167,13 @@ impl i32x8 {
                 ))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 cast(Self(f32x4_convert_i32x4(self.0), f32x4_convert_i32x4(self.1)))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    cast(Self(
+                        core::mem::transmute(vcvtq_f32_s32(self.0)),
+                        core::mem::transmute(vcvtq_f32_s32(self.1)),
+                    ))
+                }
             } else {
                 let arr: [i32; 8] = cast(self);
                 cast([
@@ -184,6 +225,10 @@ impl core::ops::Add for i32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(i32x4_add(self.0, rhs.0), i32x4_add(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vaddq_s32(self.0, rhs.0), vaddq_s32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, wrapping_add, rhs))
             }
@@ -205,6 +250,10 @@ impl core::ops::BitAnd for i32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_and(self.0, rhs.0), v128_and(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vandq_s32(self.0, rhs.0), vandq_s32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, bitand, rhs))
             }
@@ -226,6 +275,10 @@ impl core::ops::Mul for i32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(i32x4_mul(self.0, rhs.0), i32x4_mul(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmulq_s32(self.0, rhs.0), vmulq_s32(self.1, rhs.1))
+                }
             } else {
                 struct Dummy([i32; 8]);
                 let arr1: [i32; 8] = cast(self);
@@ -251,6 +304,10 @@ impl core::ops::BitOr for i32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_or(self.0, rhs.0), v128_or(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vorrq_s32(self.0, rhs.0), vorrq_s32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, bitor, rhs))
             }
@@ -273,6 +330,10 @@ impl core::ops::BitXor for i32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_xor(self.0, rhs.0), v128_xor(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(veorq_s32(self.0, rhs.0), veorq_s32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, bitxor, rhs))
             }
diff --git a/src/wide/mod.rs b/src/wide/mod.rs
index 4b5afd4..d660ca8 100644
--- a/src/wide/mod.rs
+++ b/src/wide/mod.rs
@@ -59,6 +59,16 @@ pub use u32x8_t::u32x8;
 pub use f32x16_t::f32x16;
 pub use u16x16_t::u16x16;
 
+#[allow(dead_code)]
+fn pmax(a: f32, b: f32) -> f32 {
+    if a < b { b } else { a }
+}
+
+#[allow(dead_code)]
+fn pmin(a: f32, b: f32) -> f32 {
+    if b < a { b } else { a }
+}
+
 #[allow(dead_code)]
 #[inline]
 pub fn generic_bit_blend<T>(mask: T, y: T, n: T) -> T
diff --git a/src/wide/u32x8_t.rs b/src/wide/u32x8_t.rs
index ba22f66..b5c1570 100644
--- a/src/wide/u32x8_t.rs
+++ b/src/wide/u32x8_t.rs
@@ -31,6 +31,12 @@ cfg_if::cfg_if! {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(32))]
         pub struct u32x8(v128, v128);
+    } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+        use core::arch::aarch64::*;
+
+        #[derive(Clone, Copy, Debug)]
+        #[repr(C, align(32))]
+        pub struct u32x8(uint32x4_t, uint32x4_t);
     } else {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(32))]
@@ -71,11 +77,81 @@ impl u32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(u32x4_eq(self.0, rhs.0), u32x4_eq(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vceqq_u32(self.0, rhs.0), vceqq_u32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_cmp!(self, eq, rhs, u32::MAX, 0))
             }
         }
     }
+
+    pub fn shl<const RHS: i32>(self) -> Self {
+        cfg_if::cfg_if! {
+            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
+                let shift: __m128i = cast([RHS as u64, 0]);
+                Self(unsafe { _mm256_sll_epi32(self.0, shift) })
+            } else if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
+                let shift = cast([RHS as u64, 0]);
+                Self(
+                    unsafe { _mm_sll_epi32(self.0, shift) },
+                    unsafe { _mm_sll_epi32(self.1, shift) },
+                )
+            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
+                Self(u32x4_shl(self.0, RHS as _), u32x4_shl(self.1, RHS as _))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vshlq_n_u32::<RHS>(self.0), vshlq_n_u32::<RHS>(self.1))
+                }
+            } else {
+                let u = RHS as u64;
+                Self([
+                    self.0[0] << u,
+                    self.0[1] << u,
+                    self.0[2] << u,
+                    self.0[3] << u,
+                    self.0[4] << u,
+                    self.0[5] << u,
+                    self.0[6] << u,
+                    self.0[7] << u,
+                ])
+            }
+        }
+    }
+
+    pub fn shr<const RHS: i32>(self) -> Self {
+        cfg_if::cfg_if! {
+            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
+                let shift: __m128i = cast([RHS as u64, 0]);
+                Self(unsafe { _mm256_srl_epi32(self.0, shift) })
+            } else if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
+                let shift: __m128i = cast([RHS as u64, 0]);
+                Self(
+                    unsafe { _mm_srl_epi32(self.0, shift) },
+                    unsafe { _mm_srl_epi32(self.1, shift) },
+                )
+            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
+                Self(u32x4_shr(self.0, RHS as _), u32x4_shr(self.1, RHS as _))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vshrq_n_u32::<RHS>(self.0), vshrq_n_u32::<RHS>(self.1))
+                }
+            } else {
+                let u = RHS as u64;
+                Self([
+                    self.0[0] >> u,
+                    self.0[1] >> u,
+                    self.0[2] >> u,
+                    self.0[3] >> u,
+                    self.0[4] >> u,
+                    self.0[5] >> u,
+                    self.0[6] >> u,
+                    self.0[7] >> u,
+                ])
+            }
+        }
+    }
 }
 
 impl core::ops::Not for u32x8 {
@@ -94,6 +170,10 @@ impl core::ops::Not for u32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_not(self.0), v128_not(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmvnq_u32(self.0), vmvnq_u32(self.1))
+                }
             } else {
                 Self([
                     !self.0[0],
@@ -124,6 +204,10 @@ impl core::ops::Add for u32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(u32x4_add(self.0, rhs.0), u32x4_add(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vaddq_u32(self.0, rhs.0), vaddq_u32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, wrapping_add, rhs))
             }
@@ -145,75 +229,13 @@ impl core::ops::BitAnd for u32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_and(self.0, rhs.0), v128_and(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vandq_u32(self.0, rhs.0), vandq_u32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, bitand, rhs))
             }
         }
     }
 }
-
-impl core::ops::Shl<i32> for u32x8 {
-    type Output = Self;
-
-    fn shl(self, rhs: i32) -> Self::Output {
-        cfg_if::cfg_if! {
-            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
-                let shift: __m128i = cast([rhs as u64, 0]);
-                Self(unsafe { _mm256_sll_epi32(self.0, shift) })
-            } else if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
-                let shift = cast([rhs as u64, 0]);
-                Self(
-                    unsafe { _mm_sll_epi32(self.0, shift) },
-                    unsafe { _mm_sll_epi32(self.1, shift) },
-                )
-            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(u32x4_shl(self.0, rhs as _), u32x4_shl(self.1, rhs as _))
-            } else {
-                let u = rhs as u64;
-                Self([
-                    self.0[0] << u,
-                    self.0[1] << u,
-                    self.0[2] << u,
-                    self.0[3] << u,
-                    self.0[4] << u,
-                    self.0[5] << u,
-                    self.0[6] << u,
-                    self.0[7] << u,
-                ])
-            }
-        }
-    }
-}
-
-impl core::ops::Shr<i32> for u32x8 {
-    type Output = Self;
-
-    fn shr(self, rhs: i32) -> Self::Output {
-        cfg_if::cfg_if! {
-            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
-                let shift: __m128i = cast([rhs as u64, 0]);
-                Self(unsafe { _mm256_srl_epi32(self.0, shift) })
-            } else if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
-                let shift: __m128i = cast([rhs as u64, 0]);
-                Self(
-                    unsafe { _mm_srl_epi32(self.0, shift) },
-                    unsafe { _mm_srl_epi32(self.1, shift) },
-                )
-            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(u32x4_shr(self.0, rhs as _), u32x4_shr(self.1, rhs as _))
-            } else {
-                let u = rhs as u64;
-                Self([
-                    self.0[0] >> u,
-                    self.0[1] >> u,
-                    self.0[2] >> u,
-                    self.0[3] >> u,
-                    self.0[4] >> u,
-                    self.0[5] >> u,
-                    self.0[6] >> u,
-                    self.0[7] >> u,
-                ])
-            }
-        }
-    }
-}
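Not part of the patch above — a small, self-contained sketch for reviewers illustrating why the new scalar `pmax`/`pmin` fallbacks carry the "don't have the same semantics for NaN and 0" caveat. The helper bodies are copied from the diff; the `main` function and the test values are illustrative only.

```rust
/// Copied from the diff: pseudo-max, returns `a` unless `a < b`.
fn pmax(a: f32, b: f32) -> f32 {
    if a < b { b } else { a }
}

/// Copied from the diff: pseudo-min, returns `a` unless `b < a`.
fn pmin(a: f32, b: f32) -> f32 {
    if b < a { b } else { a }
}

fn main() {
    // `f32::max` ignores NaN and returns the other operand...
    assert_eq!(f32::max(f32::NAN, 1.0), 1.0);
    // ...while the pseudo variants keep the first operand,
    // because any comparison against NaN is false.
    assert!(pmax(f32::NAN, 1.0).is_nan());
    assert_eq!(pmax(1.0, f32::NAN), 1.0);

    // For signed zero the first operand wins as well, since
    // `-0.0 < 0.0` is false under IEEE 754 comparison rules.
    assert_eq!(pmax(-0.0f32, 0.0).to_bits(), (-0.0f32).to_bits());
    assert_eq!(pmin(0.0f32, -0.0).to_bits(), 0.0f32.to_bits());
}
```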