Implement Neon SIMD

Closes #46
RazrFalcon · Aug 24, 2022 · 3457f25 · 3457f25
1 parent 375ab05
commit 3457f25
Show file tree

Hide file tree

Showing 10 changed files with 480 additions and 119 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -6,7 +6,7 @@ env:
   CARGO_TERM_COLOR: always
 
 jobs:
-  build:
+  x86:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
@@ -77,3 +77,28 @@ jobs:
       env:
         RUSTFLAGS: -Ctarget-feature=+simd128,+bulk-memory,+nontrapping-fptoint,+sign-ext
       run: cargo test --target wasm32-wasi
+
+  aarch64:
+    runs-on: ubuntu-20.04
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+
+    - name: Install toolchain
+      uses: actions-rs/toolchain@v1
+      with:
+        toolchain: stable
+        override: true
+        target: aarch64-unknown-linux-gnu
+
+    - name: Install cross
+      run: cargo install cross
+
+    - name: Build with minimal features (no_std)
+      run: cross build --target aarch64-unknown-linux-gnu --verbose --no-default-features --features no-std-float
+
+    - name: Run tests without SIMD
+      run: cross test --target aarch64-unknown-linux-gnu --verbose --no-default-features --features png-format
+
+    - name: Run tests with Neon
+      run: cross test --target aarch64-unknown-linux-gnu
diff --git a/Cargo.toml b/Cargo.toml
@@ -30,8 +30,9 @@ default = ["std", "simd", "png-format"]
 std = ["tiny-skia-path/std"]
 no-std-float = ["tiny-skia-path/no-std-float"]
 
-# Enables x86 SIMD instructions from SSE up to AVX2.
-# Has no effect on non-x86 targets. Present mainly for testing.
+# Enables SIMD instructions on x86 (from SSE up to AVX2), WebAssembly (SIMD128)
+# and Aarch64 (Neon).
+# Has no effect on other targets. Present mainly for testing.
 simd = []
 
 # Allows loading and saving `Pixmap` as PNG.

diff --git a/path/src/f32x2_t.rs b/path/src/f32x2_t.rs
@@ -36,22 +36,22 @@ impl f32x2 {
     /// Returns a minimum value.
     pub fn min(self, other: f32x2) -> f32x2 {
         f32x2([
-            self.x().min(other.x()),
-            self.y().min(other.y()),
+            pmin(self.x(), other.x()),
+            pmin(self.y(), other.y()),
         ])
     }
 
     /// Returns a maximum value.
     pub fn max(self, other: f32x2) -> f32x2 {
         f32x2([
-            self.x().max(other.x()),
-            self.y().max(other.y()),
+            pmax(self.x(), other.x()),
+            pmax(self.y(), other.y()),
         ])
     }
 
     /// Returns a maximum of both values.
     pub fn max_component(self) -> f32 {
-        self.x().max(self.y())
+        pmax(self.x(), self.y())
     }
 
     /// Returns the first value.
@@ -104,3 +104,11 @@ impl core::ops::Div<f32x2> for f32x2 {
         ])
     }
 }
+
+fn pmax(a: f32, b: f32) -> f32 {
+    if a < b { b } else { a }
+}
+
+fn pmin(a: f32, b: f32) -> f32 {
+    if b < a { b } else { a }
+}
diff --git a/src/pipeline/highp.rs b/src/pipeline/highp.rs
@@ -416,7 +416,7 @@ blend_fn2!(color_burn, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
         d + s * inv(da),
         s.cmp_eq(f32x8::default()).blend(
             d * inv(sa),
-            sa * (da - da.min((da - d) * sa * s.recip())) + s * inv(da) + d * inv(sa)
+            sa * (da - da.min((da - d) * sa * s.recip_fast())) + s * inv(da) + d * inv(sa)
         )
     )
 );
@@ -426,7 +426,7 @@ blend_fn2!(color_dodge, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
         s * inv(da),
         s.cmp_eq(sa).blend(
             s + d * inv(sa),
-            sa * da.min((d * sa) * (sa - s).recip()) + s * inv(da) + d * inv(sa)
+            sa * da.min((d * sa) * (sa - s).recip_fast()) + s * inv(da) + d * inv(sa)
         )
     )
 );
@@ -456,7 +456,7 @@ blend_fn2!(soft_light, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| {
     //    3. light src, light dst?
     let dark_src = d * (sa + (s2 - sa) * (f32x8::splat(1.0) - m));
     let dark_dst = (m4 * m4 + m4) * (m - f32x8::splat(1.0)) + f32x8::splat(7.0) * m;
-    let lite_dst = m.recip_sqrt().recip() - m;
+    let lite_dst = m.sqrt() - m;
     let lite_src = d * sa + da * (s2 - sa)
         * two(two(d)).cmp_le(da).blend(dark_dst, lite_dst); // 2 or 3?
 

diff --git a/src/wide/f32x16_t.rs b/src/wide/f32x16_t.rs
@@ -74,7 +74,7 @@ impl f32x16 {
 
     pub fn floor(&self) -> Self {
         // Yes, Skia does it in the same way.
-        let roundtrip = self.round_int();
+        let roundtrip = self.round();
         roundtrip - roundtrip.cmp_gt(self).blend(f32x16::splat(1.0), f32x16::splat(0.0))
     }
 
@@ -85,10 +85,10 @@ impl f32x16 {
         ])
     }
 
-    pub fn round_int(&self) -> Self {
+    pub fn round(&self) -> Self {
         Self([
-            self.0[0].round_int().to_f32x8(),
-            self.0[1].round_int().to_f32x8(),
+            self.0[0].round(),
+            self.0[1].round(),
         ])
     }
 

diff --git a/src/wide/f32x4_t.rs b/src/wide/f32x4_t.rs
@@ -24,6 +24,12 @@ cfg_if::cfg_if! {
         #[derive(Clone, Copy, Debug)]
         #[repr(transparent)]
         pub struct f32x4(v128);
+    } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+        use core::arch::aarch64::*;
+
+        #[derive(Clone, Copy, Debug)]
+        #[repr(C, align(16))]
+        pub struct f32x4(float32x4_t);
     } else {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(16))]
@@ -40,34 +46,46 @@ impl f32x4 {
     }
 
     pub fn max(self, rhs: Self) -> Self {
+        // These technically don't have the same semantics for NaN and 0, but it
+        // doesn't seem to matter as Skia does it the same way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                 Self(unsafe { _mm_max_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(f32x4_max(self.0, rhs.0))
+                Self(f32x4_pmax(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmaxq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
-                    self.0[0].max(rhs.0[0]),
-                    self.0[1].max(rhs.0[1]),
-                    self.0[2].max(rhs.0[2]),
-                    self.0[3].max(rhs.0[3]),
+                    super::pmax(self.0[0], rhs.0[0]),
+                    super::pmax(self.0[1], rhs.0[1]),
+                    super::pmax(self.0[2], rhs.0[2]),
+                    super::pmax(self.0[3], rhs.0[3]),
                 ])
             }
         }
     }
 
     pub fn min(self, rhs: Self) -> Self {
+        // These technically don't have the same semantics for NaN and 0, but it
+        // doesn't seem to matter as Skia does it the same way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                 Self(unsafe { _mm_min_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(f32x4_min(self.0, rhs.0))
+                Self(f32x4_pmin(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vminq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
-                    self.0[0].min(rhs.0[0]),
-                    self.0[1].min(rhs.0[1]),
-                    self.0[2].min(rhs.0[2]),
-                    self.0[3].min(rhs.0[3]),
+                    super::pmin(self.0[0], rhs.0[0]),
+                    super::pmin(self.0[1], rhs.0[1]),
+                    super::pmin(self.0[2], rhs.0[2]),
+                    super::pmin(self.0[3], rhs.0[3]),
                 ])
             }
         }
@@ -95,6 +113,10 @@ impl core::ops::Add for f32x4 {
                 Self(unsafe { _mm_add_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_add(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vaddq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
                     self.0[0] + rhs.0[0],
@@ -122,6 +144,10 @@ impl core::ops::Sub for f32x4 {
                 Self(unsafe { _mm_sub_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_sub(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vsubq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
                     self.0[0] - rhs.0[0],
@@ -143,6 +169,10 @@ impl core::ops::Mul for f32x4 {
                 Self(unsafe { _mm_mul_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_mul(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmulq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
                     self.0[0] * rhs.0[0],