diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 0d33e91..3a29952 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -6,7 +6,7 @@ env:
   CARGO_TERM_COLOR: always
 
 jobs:
-  build:
+  x86:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
@@ -77,3 +77,28 @@ jobs:
       env:
         RUSTFLAGS: -Ctarget-feature=+simd128,+bulk-memory,+nontrapping-fptoint,+sign-ext
       run: cargo test --target wasm32-wasi
+
+  aarch64:
+    runs-on: ubuntu-20.04
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+
+    - name: Install toolchain
+      uses: actions-rs/toolchain@v1
+      with:
+        toolchain: stable
+        override: true
+        target: aarch64-unknown-linux-gnu
+
+    - name: Install cross
+      run: cargo install cross
+
+    - name: Build with minimal features (no_std)
+      run: cross build --target aarch64-unknown-linux-gnu --verbose --no-default-features --features no-std-float
+
+    - name: Run tests without SIMD
+      run: cross test --target aarch64-unknown-linux-gnu --verbose --no-default-features --features png-format
+
+    - name: Run tests with Neon
+      run: cross test --target aarch64-unknown-linux-gnu
diff --git a/Cargo.toml b/Cargo.toml
index b4a9c8b..6669b3e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,8 +30,9 @@ default = ["std", "simd", "png-format"]
 std = ["tiny-skia-path/std"]
 no-std-float = ["tiny-skia-path/no-std-float"]
 
-# Enables x86 SIMD instructions from SSE up to AVX2.
-# Has no effect on non-x86 targets. Present mainly for testing.
+# Enables SIMD instructions on x86 (from SSE up to AVX2), WebAssembly (SIMD128)
+# and AArch64 (Neon).
+# Has no effect on other targets. Present mainly for testing.
 simd = []
 
 # Allows loading and saving `Pixmap` as PNG.
diff --git a/path/src/f32x2_t.rs b/path/src/f32x2_t.rs
index ae700b7..e9f9d09 100644
--- a/path/src/f32x2_t.rs
+++ b/path/src/f32x2_t.rs
@@ -36,22 +36,22 @@ impl f32x2 {
     /// Returns a minimum value.
     pub fn min(self, other: f32x2) -> f32x2 {
         f32x2([
-            self.x().min(other.x()),
-            self.y().min(other.y()),
+            pmin(self.x(), other.x()),
+            pmin(self.y(), other.y()),
         ])
     }
 
     /// Returns a maximum value.
     pub fn max(self, other: f32x2) -> f32x2 {
         f32x2([
-            self.x().max(other.x()),
-            self.y().max(other.y()),
+            pmax(self.x(), other.x()),
+            pmax(self.y(), other.y()),
         ])
     }
 
     /// Returns a maximum of both values.
     pub fn max_component(self) -> f32 {
-        self.x().max(self.y())
+        pmax(self.x(), self.y())
     }
 
     /// Returns the first value.
@@ -104,3 +104,11 @@ impl core::ops::Div for f32x2 {
         ])
     }
 }
+
+fn pmax(a: f32, b: f32) -> f32 {
+    if a < b { b } else { a }
+}
+
+fn pmin(a: f32, b: f32) -> f32 {
+    if b < a { b } else { a }
+}
diff --git a/src/pipeline/highp.rs b/src/pipeline/highp.rs
index bdbde35..5a27ca7 100644
--- a/src/pipeline/highp.rs
+++ b/src/pipeline/highp.rs
@@ -416,7 +416,7 @@ blend_fn2!(color_burn, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
         d + s * inv(da),
         s.cmp_eq(f32x8::default()).blend(
             d * inv(sa),
-            sa * (da - da.min((da - d) * sa * s.recip())) + s * inv(da) + d * inv(sa)
+            sa * (da - da.min((da - d) * sa * s.recip_fast())) + s * inv(da) + d * inv(sa)
         )
     )
 );
@@ -426,7 +426,7 @@ blend_fn2!(color_dodge, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
         s * inv(da),
         s.cmp_eq(sa).blend(
             s + d * inv(sa),
-            sa * da.min((d * sa) * (sa - s).recip()) + s * inv(da) + d * inv(sa)
+            sa * da.min((d * sa) * (sa - s).recip_fast()) + s * inv(da) + d * inv(sa)
         )
     )
 );
@@ -456,7 +456,7 @@ blend_fn2!(soft_light, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| {
     // 3. light src, light dst?
     let dark_src = d * (sa + (s2 - sa) * (f32x8::splat(1.0) - m));
     let dark_dst = (m4 * m4 + m4) * (m - f32x8::splat(1.0)) + f32x8::splat(7.0) * m;
-    let lite_dst = m.recip_sqrt().recip() - m;
+    let lite_dst = m.sqrt() - m;
     let lite_src = d * sa + da * (s2 - sa) * two(two(d)).cmp_le(da).blend(dark_dst, lite_dst);
 
     // 2 or 3?
diff --git a/src/wide/f32x16_t.rs b/src/wide/f32x16_t.rs
index 17d1a46..3ddf2df 100644
--- a/src/wide/f32x16_t.rs
+++ b/src/wide/f32x16_t.rs
@@ -74,7 +74,7 @@ impl f32x16 {
 
     pub fn floor(&self) -> Self {
         // Yes, Skia does it in the same way.
-        let roundtrip = self.round_int();
+        let roundtrip = self.round();
         roundtrip - roundtrip.cmp_gt(self).blend(f32x16::splat(1.0), f32x16::splat(0.0))
     }
 
@@ -85,10 +85,10 @@ impl f32x16 {
         ])
     }
 
-    pub fn round_int(&self) -> Self {
+    pub fn round(&self) -> Self {
         Self([
-            self.0[0].round_int().to_f32x8(),
-            self.0[1].round_int().to_f32x8(),
+            self.0[0].round(),
+            self.0[1].round(),
         ])
     }
 
diff --git a/src/wide/f32x4_t.rs b/src/wide/f32x4_t.rs
index 6190f15..4f9db39 100644
--- a/src/wide/f32x4_t.rs
+++ b/src/wide/f32x4_t.rs
@@ -24,6 +24,12 @@ cfg_if::cfg_if! {
         #[derive(Clone, Copy, Debug)]
         #[repr(transparent)]
         pub struct f32x4(v128);
+    } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+        use core::arch::aarch64::*;
+
+        #[derive(Clone, Copy, Debug)]
+        #[repr(C, align(16))]
+        pub struct f32x4(float32x4_t);
     } else {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(16))]
@@ -40,34 +46,46 @@ impl f32x4 {
     }
 
     pub fn max(self, rhs: Self) -> Self {
+        // These technically don't have the same semantics for NaN and 0, but it
+        // doesn't seem to matter as Skia does it the same way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                 Self(unsafe { _mm_max_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(f32x4_max(self.0, rhs.0))
+                Self(f32x4_pmax(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmaxq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
-                    self.0[0].max(rhs.0[0]),
-                    self.0[1].max(rhs.0[1]),
-                    self.0[2].max(rhs.0[2]),
-                    self.0[3].max(rhs.0[3]),
+                    super::pmax(self.0[0], rhs.0[0]),
+                    super::pmax(self.0[1], rhs.0[1]),
+                    super::pmax(self.0[2], rhs.0[2]),
+                    super::pmax(self.0[3], rhs.0[3]),
                 ])
             }
         }
     }
 
     pub fn min(self, rhs: Self) -> Self {
+        // These technically don't have the same semantics for NaN and 0, but it
+        // doesn't seem to matter as Skia does it the same way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                 Self(unsafe { _mm_min_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(f32x4_min(self.0, rhs.0))
+                Self(f32x4_pmin(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vminq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
-                    self.0[0].min(rhs.0[0]),
-                    self.0[1].min(rhs.0[1]),
-                    self.0[2].min(rhs.0[2]),
-                    self.0[3].min(rhs.0[3]),
+                    super::pmin(self.0[0], rhs.0[0]),
+                    super::pmin(self.0[1], rhs.0[1]),
+                    super::pmin(self.0[2], rhs.0[2]),
+                    super::pmin(self.0[3], rhs.0[3]),
                 ])
             }
         }
@@ -95,6 +113,10 @@ impl core::ops::Add for f32x4 {
                 Self(unsafe { _mm_add_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_add(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vaddq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
                     self.0[0] + rhs.0[0],
@@ -122,6 +144,10 @@ impl core::ops::Sub for f32x4 {
                 Self(unsafe { _mm_sub_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_sub(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vsubq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
                     self.0[0] - rhs.0[0],
@@ -143,6 +169,10 @@ impl core::ops::Mul for f32x4 {
                 Self(unsafe { _mm_mul_ps(self.0, rhs.0) })
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_mul(self.0, rhs.0))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmulq_f32(self.0, rhs.0))
+                }
             } else {
                 Self([
                     self.0[0] * rhs.0[0],
diff --git a/src/wide/f32x8_t.rs b/src/wide/f32x8_t.rs
index e9398dc..daa88dc 100644
--- a/src/wide/f32x8_t.rs
+++ b/src/wide/f32x8_t.rs
@@ -32,6 +32,12 @@ cfg_if::cfg_if! {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(32))]
         pub struct f32x8(v128, v128);
+    } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+        use core::arch::aarch64::*;
+
+        #[derive(Clone, Copy, Debug)]
+        #[repr(C, align(32))]
+        pub struct f32x8(float32x4_t, float32x4_t);
     } else {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(32))]
@@ -57,6 +63,10 @@ impl f32x8 {
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_floor(self.0), f32x4_floor(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vrndmq_f32(self.0), vrndmq_f32(self.1))
+                }
             } else {
                 let roundtrip: f32x8 = cast(self.trunc_int().to_f32x8());
                 roundtrip - roundtrip.cmp_gt(self).blend(f32x8::splat(1.0), f32x8::default())
@@ -91,6 +101,13 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_eq(self.0, rhs.0), f32x4_eq(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vceqq_f32(self.0, rhs.0)),
+                        core::mem::transmute(vceqq_f32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, eq, rhs, f32::from_bits(u32::MAX), 0.0))
             }
@@ -108,6 +125,13 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_ge(self.0, rhs.0), f32x4_ge(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vcgeq_f32(self.0, rhs.0)),
+                        core::mem::transmute(vcgeq_f32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, ge, rhs, f32::from_bits(u32::MAX), 0.0))
             }
@@ -125,6 +149,13 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_gt(self.0, rhs.0), f32x4_gt(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vcgtq_f32(self.0, rhs.0)),
+                        core::mem::transmute(vcgtq_f32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, gt, rhs, f32::from_bits(u32::MAX), 0.0))
             }
@@ -142,6 +173,13 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_ne(self.0, rhs.0), f32x4_ne(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vmvnq_u32(vceqq_f32(self.0, rhs.0))),
+                        core::mem::transmute(vmvnq_u32(vceqq_f32(self.1, rhs.1))),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, ne, rhs, f32::from_bits(u32::MAX), 0.0))
             }
@@ -159,6 +197,13 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_le(self.0, rhs.0), f32x4_le(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vcleq_f32(self.0, rhs.0)),
+                        core::mem::transmute(vcleq_f32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, le, rhs, f32::from_bits(u32::MAX), 0.0))
             }
@@ -176,6 +221,13 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_lt(self.0, rhs.0), f32x4_lt(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vcltq_f32(self.0, rhs.0)),
+                        core::mem::transmute(vcltq_f32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, lt, rhs, f32::from_bits(u32::MAX), 0.0))
             }
@@ -194,6 +246,21 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_bitselect(t.0, f.0, self.0), v128_bitselect(t.1, f.1, self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vbslq_u32(
+                            core::mem::transmute(self.0),
+                            core::mem::transmute(t.0),
+                            core::mem::transmute(f.0),
+                        )),
+                        core::mem::transmute(vbslq_u32(
+                            core::mem::transmute(self.1),
+                            core::mem::transmute(t.1),
+                            core::mem::transmute(f.1),
+                        )),
+                    )
+                }
             } else {
                 super::generic_bit_blend(self, t, f)
             }
@@ -204,6 +271,10 @@ impl f32x8 {
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_abs(self.0), f32x4_abs(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vabsq_f32(self.0), vabsq_f32(self.1))
+                }
             } else {
                 let non_sign_bits = f32x8::splat(f32::from_bits(i32::MAX as u32));
                 self & non_sign_bits
@@ -212,6 +283,8 @@ impl f32x8 {
     }
 
     pub fn max(self, rhs: Self) -> Self {
+        // These technically don't have the same semantics for NaN and 0, but it
+        // doesn't seem to matter as Skia does it the same way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                 Self(unsafe { _mm256_max_ps(self.0, rhs.0) })
@@ -221,14 +294,29 @@ impl f32x8 {
                     unsafe { _mm_max_ps(self.1, rhs.1) },
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(f32x4_max(self.0, rhs.0), f32x4_max(self.1, rhs.1))
+                Self(f32x4_pmax(self.0, rhs.0), f32x4_pmax(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmaxq_f32(self.0, rhs.0), vmaxq_f32(self.1, rhs.1))
+                }
             } else {
-                Self(impl_x8_op!(self, max, rhs))
+                Self([
+                    super::pmax(self.0[0], rhs.0[0]),
+                    super::pmax(self.0[1], rhs.0[1]),
+                    super::pmax(self.0[2], rhs.0[2]),
+                    super::pmax(self.0[3], rhs.0[3]),
+                    super::pmax(self.0[4], rhs.0[4]),
+                    super::pmax(self.0[5], rhs.0[5]),
+                    super::pmax(self.0[6], rhs.0[6]),
+                    super::pmax(self.0[7], rhs.0[7]),
+                ])
             }
         }
     }
 
     pub fn min(self, rhs: Self) -> Self {
+        // These technically don't have the same semantics for NaN and 0, but it
+        // doesn't seem to matter as Skia does it the same way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                 Self(unsafe { _mm256_min_ps(self.0, rhs.0) })
@@ -238,9 +326,22 @@ impl f32x8 {
                     unsafe { _mm_min_ps(self.1, rhs.1) },
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(f32x4_min(self.0, rhs.0), f32x4_min(self.1, rhs.1))
+                Self(f32x4_pmin(self.0, rhs.0), f32x4_pmin(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vminq_f32(self.0, rhs.0), vminq_f32(self.1, rhs.1))
+                }
             } else {
-                Self(impl_x8_op!(self, min, rhs))
+                Self([
+                    super::pmin(self.0[0], rhs.0[0]),
+                    super::pmin(self.0[1], rhs.0[1]),
+                    super::pmin(self.0[2], rhs.0[2]),
+                    super::pmin(self.0[3], rhs.0[3]),
+                    super::pmin(self.0[4], rhs.0[4]),
+                    super::pmin(self.0[5], rhs.0[5]),
+                    super::pmin(self.0[6], rhs.0[6]),
+                    super::pmin(self.0[7], rhs.0[7]),
+                ])
             }
         }
     }
@@ -248,7 +349,7 @@ impl f32x8 {
     pub fn is_finite(self) -> Self {
         let shifted_exp_mask = u32x8::splat(0xFF000000);
         let u: u32x8 = cast(self);
-        let shift_u = u << 1;
+        let shift_u = u.shl::<1>();
         let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
         cast(out)
     }
@@ -264,10 +365,14 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_nearest(self.0), f32x4_nearest(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vrndnq_f32(self.0), vrndnq_f32(self.1))
+                }
             } else {
                 let to_int = f32x8::splat(1.0 / f32::EPSILON);
                 let u: u32x8 = cast(self);
-                let e: i32x8 = cast((u >> 23) & u32x8::splat(0xff));
+                let e: i32x8 = cast(u.shr::<23>() & u32x8::splat(0xff));
                 let mut y: f32x8;
 
                 let no_op_magic = i32x8::splat(0x7f + 23);
@@ -293,6 +398,9 @@ impl f32x8 {
     }
 
     pub fn round_int(self) -> i32x8 {
+        // These technically don't have the same semantics for NaN and out of
+        // range values, but it doesn't seem to matter as Skia does it the same
+        // way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                 cast(unsafe { _mm256_cvtps_epi32(self.0) })
@@ -304,9 +412,13 @@ impl f32x8 {
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 let rounded = self.round();
                 i32x8(i32x4_trunc_sat_f32x4(rounded.0), i32x4_trunc_sat_f32x4(rounded.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    i32x8(vcvtnq_s32_f32(self.0), vcvtnq_s32_f32(self.1))
+                }
             } else {
                 let rounded: [f32; 8] = cast(self.round());
-                let rounded_ints: i32x8 = cast([
+                cast([
                     rounded[0] as i32,
                     rounded[1] as i32,
                     rounded[2] as i32,
@@ -315,16 +427,15 @@ impl f32x8 {
                     rounded[5] as i32,
                     rounded[6] as i32,
                     rounded[7] as i32,
-                ]);
-                cast::<i32x8>(self.is_finite()).blend(
-                    rounded_ints,
-                    i32x8::splat(i32::MIN)
-                )
+                ])
             }
         }
     }
 
     pub fn trunc_int(self) -> i32x8 {
+        // These technically don't have the same semantics for NaN and out of
+        // range values, but it doesn't seem to matter as Skia does it the same
+        // way.
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                 cast(unsafe { _mm256_cvttps_epi32(self.0) })
@@ -334,28 +445,28 @@ impl f32x8 {
                     unsafe { _mm_cvttps_epi32(self.1) },
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                cast(Self(
-                    i32x4_trunc_sat_f32x4(self.0),
-                    i32x4_trunc_sat_f32x4(self.1),
-                ))
+                i32x8(i32x4_trunc_sat_f32x4(self.0), i32x4_trunc_sat_f32x4(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    i32x8(vcvtq_s32_f32(self.0), vcvtq_s32_f32(self.1))
+                }
             } else {
                 let n: [f32; 8] = cast(self);
-                let ints: i32x8 = cast([
-                    n[0].trunc() as i32,
-                    n[1].trunc() as i32,
-                    n[2].trunc() as i32,
-                    n[3].trunc() as i32,
-                    n[4].trunc() as i32,
-                    n[5].trunc() as i32,
-                    n[6].trunc() as i32,
-                    n[7].trunc() as i32,
-                ]);
-                cast::<i32x8>(self.is_finite()).blend(ints, i32x8::splat(i32::MIN))
+                cast([
+                    n[0] as i32,
+                    n[1] as i32,
+                    n[2] as i32,
+                    n[3] as i32,
+                    n[4] as i32,
+                    n[5] as i32,
+                    n[6] as i32,
+                    n[7] as i32,
+                ])
             }
         }
     }
 
-    pub fn recip(self) -> Self {
+    pub fn recip_fast(self) -> Self {
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "avx"))] {
                 Self(unsafe { _mm256_rcp_ps(self.0) })
@@ -370,6 +481,16 @@ impl f32x8 {
                     f32x4_div(one, self.0),
                     f32x4_div(one, self.1),
                 )
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    let a = vrecpeq_f32(self.0);
+                    let a = vmulq_f32(vrecpsq_f32(self.0, a), a);
+
+                    let b = vrecpeq_f32(self.1);
+                    let b = vmulq_f32(vrecpsq_f32(self.1, b), b);
+
+                    Self(a, b)
+                }
             } else {
                 Self::from([
                     1.0 / self.0[0],
@@ -400,6 +521,16 @@ impl f32x8 {
                     f32x4_div(one, f32x4_sqrt(self.0)),
                     f32x4_div(one, f32x4_sqrt(self.1)),
                 )
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    let a = vrsqrteq_f32(self.0);
+                    let a = vmulq_f32(vrsqrtsq_f32(self.0, vmulq_f32(a, a)), a);
+
+                    let b = vrsqrteq_f32(self.1);
+                    let b = vmulq_f32(vrsqrtsq_f32(self.1, vmulq_f32(b, b)), b);
+
+                    Self(a, b)
+                }
             } else {
                 Self::from([
                     1.0 / self.0[0].sqrt(),
@@ -426,6 +557,10 @@ impl f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_sqrt(self.0), f32x4_sqrt(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vsqrtq_f32(self.0), vsqrtq_f32(self.1))
+                }
             } else {
                 Self::from([
                     self.0[0].sqrt(),
@@ -468,6 +603,10 @@ impl core::ops::Add for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_add(self.0, rhs.0), f32x4_add(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vaddq_f32(self.0, rhs.0), vaddq_f32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, add, rhs))
             }
@@ -495,6 +634,10 @@ impl core::ops::Sub for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_sub(self.0, rhs.0), f32x4_sub(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vsubq_f32(self.0, rhs.0), vsubq_f32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, sub, rhs))
             }
@@ -516,6 +659,10 @@ impl core::ops::Mul for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_mul(self.0, rhs.0), f32x4_mul(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmulq_f32(self.0, rhs.0), vmulq_f32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, mul, rhs))
             }
@@ -543,6 +690,10 @@ impl core::ops::Div for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_div(self.0, rhs.0), f32x4_div(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vdivq_f32(self.0, rhs.0), vdivq_f32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, div, rhs))
             }
@@ -565,6 +716,19 @@ impl core::ops::BitAnd for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_and(self.0, rhs.0), v128_and(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vandq_u32(
+                            core::mem::transmute(self.0),
+                            core::mem::transmute(rhs.0),
+                        )),
+                        core::mem::transmute(vandq_u32(
+                            core::mem::transmute(self.1),
+                            core::mem::transmute(rhs.1),
+                        )),
+                    )
+                }
             } else {
                 Self([
                     f32::from_bits(self.0[0].to_bits() & rhs.0[0].to_bits()),
@@ -596,6 +760,19 @@ impl core::ops::BitOr for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_or(self.0, rhs.0), v128_or(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vorrq_u32(
+                            core::mem::transmute(self.0),
+                            core::mem::transmute(rhs.0),
+                        )),
+                        core::mem::transmute(vorrq_u32(
+                            core::mem::transmute(self.1),
+                            core::mem::transmute(rhs.1),
+                        )),
+                    )
+                }
             } else {
                 Self([
                     f32::from_bits(self.0[0].to_bits() | rhs.0[0].to_bits()),
@@ -627,6 +804,19 @@ impl core::ops::BitXor for f32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_xor(self.0, rhs.0), v128_xor(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(veorq_u32(
+                            core::mem::transmute(self.0),
+                            core::mem::transmute(rhs.0),
+                        )),
+                        core::mem::transmute(veorq_u32(
+                            core::mem::transmute(self.1),
+                            core::mem::transmute(rhs.1),
+                        )),
+                    )
+                }
             } else {
                 Self([
                     f32::from_bits(self.0[0].to_bits() ^ rhs.0[0].to_bits()),
@@ -666,6 +856,13 @@ impl core::ops::Not for f32x8 {
                 }
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_not(self.0), v128_not(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vmvnq_u32(core::mem::transmute(self.0))),
+                        core::mem::transmute(vmvnq_u32(core::mem::transmute(self.1))),
+                    )
+                }
             } else {
                 self ^ Self::splat(cast(u32::MAX))
             }
@@ -681,6 +878,13 @@ impl core::cmp::PartialEq for f32x8 {
                 unsafe { _mm256_movemask_ps(mask) == 0b1111_1111 }
             } else if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                 unsafe { _mm_movemask_ps(_mm_cmpeq_ps(self.0, rhs.0)) == 0b1111 }
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    vminvq_u32(vandq_u32(
+                        vceqq_f32(self.0, rhs.0),
+                        vceqq_f32(self.1, rhs.1),
+                    )) != 0
+                }
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 u32x4_all_true(f32x4_eq(self.0, rhs.0)) &
                 u32x4_all_true(f32x4_eq(self.1, rhs.1))
diff --git a/src/wide/i32x8_t.rs b/src/wide/i32x8_t.rs
index ccee170..dd3c769 100644
--- a/src/wide/i32x8_t.rs
+++ b/src/wide/i32x8_t.rs
@@ -29,6 +29,12 @@ cfg_if::cfg_if! {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(32))]
         pub struct i32x8(pub v128, pub v128);
+    } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+        use core::arch::aarch64::*;
+
+        #[derive(Clone, Copy, Debug)]
+        #[repr(C, align(32))]
+        pub struct i32x8(pub int32x4_t, pub int32x4_t);
     } else {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(32))]
@@ -61,6 +67,13 @@ impl i32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_bitselect(t.0, f.0, self.0), v128_bitselect(t.1, f.1, self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        vbslq_s32(core::mem::transmute(self.0), t.0, f.0),
+                        vbslq_s32(core::mem::transmute(self.1), t.1, f.1),
+                    )
+                }
             } else {
                 super::generic_bit_blend(self, t, f)
             }
@@ -78,6 +91,13 @@ impl i32x8 {
                 ))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(i32x4_eq(self.0, rhs.0), i32x4_eq(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vceqq_s32(self.0, rhs.0)),
+                        core::mem::transmute(vceqq_s32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, eq, rhs, -1, 0))
             }
@@ -95,6 +115,13 @@ impl i32x8 {
                 ))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(i32x4_gt(self.0, rhs.0), i32x4_eq(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vcgtq_s32(self.0, rhs.0)),
+                        core::mem::transmute(vcgtq_s32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, gt, rhs, -1, 0))
             }
@@ -116,6 +143,13 @@ impl i32x8 {
                 ))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(i32x4_lt(self.0, rhs.0), i32x4_lt(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(
+                        core::mem::transmute(vcltq_s32(self.0, rhs.0)),
+                        core::mem::transmute(vcltq_s32(self.1, rhs.1)),
+                    )
+                }
             } else {
                 Self(impl_x8_cmp!(self, lt, rhs, -1, 0))
             }
@@ -133,6 +167,13 @@ impl i32x8 {
                 ))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 cast(Self(f32x4_convert_i32x4(self.0), f32x4_convert_i32x4(self.1)))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    cast(Self(
+                        core::mem::transmute(vcvtq_f32_s32(self.0)),
+                        core::mem::transmute(vcvtq_f32_s32(self.1)),
+                    ))
+                }
             } else {
                 let arr: [i32; 8] = cast(self);
                 cast([
@@ -184,6 +225,10 @@ impl core::ops::Add for i32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(i32x4_add(self.0, rhs.0), i32x4_add(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vaddq_s32(self.0, rhs.0), vaddq_s32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, wrapping_add, rhs))
             }
@@ -205,6 +250,10 @@ impl core::ops::BitAnd for i32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_and(self.0, rhs.0), v128_and(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vandq_s32(self.0, rhs.0), vandq_s32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, bitand, rhs))
             }
@@ -226,6 +275,10 @@ impl core::ops::Mul for i32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(i32x4_mul(self.0, rhs.0), i32x4_mul(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmulq_s32(self.0, rhs.0), vmulq_s32(self.1, rhs.1))
+                }
             } else {
                 struct Dummy([i32; 8]);
                 let arr1: [i32; 8] = cast(self);
@@ -251,6 +304,10 @@ impl core::ops::BitOr for i32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_or(self.0, rhs.0), v128_or(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vorrq_s32(self.0, rhs.0), vorrq_s32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, bitor, rhs))
             }
@@ -273,6 +330,10 @@ impl core::ops::BitXor for i32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_xor(self.0, rhs.0), v128_xor(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(veorq_s32(self.0, rhs.0), veorq_s32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, bitxor, rhs))
             }
diff --git a/src/wide/mod.rs b/src/wide/mod.rs
index 4b5afd4..d660ca8 100644
--- a/src/wide/mod.rs
+++ b/src/wide/mod.rs
@@ -59,6 +59,16 @@ pub use u32x8_t::u32x8;
 pub use f32x16_t::f32x16;
 pub use u16x16_t::u16x16;
 
+#[allow(dead_code)]
+fn pmax(a: f32, b: f32) -> f32 {
+    if a < b { b } else { a }
+}
+
+#[allow(dead_code)]
+fn pmin(a: f32, b: f32) -> f32 {
+    if b < a { b } else { a }
+}
+
 #[allow(dead_code)]
 #[inline]
 pub fn generic_bit_blend<T>(mask: T, y: T, n: T) -> T
diff --git a/src/wide/u32x8_t.rs b/src/wide/u32x8_t.rs
index ba22f66..b5c1570 100644
--- a/src/wide/u32x8_t.rs
+++ b/src/wide/u32x8_t.rs
@@ -31,6 +31,12 @@ cfg_if::cfg_if! {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(32))]
         pub struct u32x8(v128, v128);
+    } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+        use core::arch::aarch64::*;
+
+        #[derive(Clone, Copy, Debug)]
+        #[repr(C, align(32))]
+        pub struct u32x8(uint32x4_t, uint32x4_t);
     } else {
         #[derive(Clone, Copy, Debug)]
         #[repr(C, align(32))]
@@ -71,11 +77,81 @@ impl u32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(u32x4_eq(self.0, rhs.0), u32x4_eq(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vceqq_u32(self.0, rhs.0), vceqq_u32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_cmp!(self, eq, rhs, u32::MAX, 0))
             }
         }
     }
+
+    pub fn shl<const RHS: i32>(self) -> Self {
+        cfg_if::cfg_if! {
+            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
+                let shift: __m128i = cast([RHS as u64, 0]);
+                Self(unsafe { _mm256_sll_epi32(self.0, shift) })
+            } else if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
+                let shift = cast([RHS as u64, 0]);
+                Self(
+                    unsafe { _mm_sll_epi32(self.0, shift) },
+                    unsafe { _mm_sll_epi32(self.1, shift) },
+                )
+            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
+                Self(u32x4_shl(self.0, RHS as _), u32x4_shl(self.1, RHS as _))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vshlq_n_u32::<RHS>(self.0), vshlq_n_u32::<RHS>(self.1))
+                }
+            } else {
+                let u = RHS as u64;
+                Self([
+                    self.0[0] << u,
+                    self.0[1] << u,
+                    self.0[2] << u,
+                    self.0[3] << u,
+                    self.0[4] << u,
+                    self.0[5] << u,
+                    self.0[6] << u,
+                    self.0[7] << u,
+                ])
+            }
+        }
+    }
+
+    pub fn shr<const RHS: i32>(self) -> Self {
+        cfg_if::cfg_if! {
+            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
+                let shift: __m128i = cast([RHS as u64, 0]);
+                Self(unsafe { _mm256_srl_epi32(self.0, shift) })
+            } else if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
+                let shift: __m128i = cast([RHS as u64, 0]);
+                Self(
+                    unsafe { _mm_srl_epi32(self.0, shift) },
+                    unsafe { _mm_srl_epi32(self.1, shift) },
+                )
+            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
+                Self(u32x4_shr(self.0, RHS as _), u32x4_shr(self.1, RHS as _))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vshrq_n_u32::<RHS>(self.0), vshrq_n_u32::<RHS>(self.1))
+                }
+            } else {
+                let u = RHS as u64;
+                Self([
+                    self.0[0] >> u,
+                    self.0[1] >> u,
+                    self.0[2] >> u,
+                    self.0[3] >> u,
+                    self.0[4] >> u,
+                    self.0[5] >> u,
+                    self.0[6] >> u,
+                    self.0[7] >> u,
+                ])
+            }
+        }
+    }
 }
 
 impl core::ops::Not for u32x8 {
@@ -94,6 +170,10 @@ impl core::ops::Not for u32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_not(self.0), v128_not(self.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vmvnq_u32(self.0), vmvnq_u32(self.1))
+                }
             } else {
                 Self([
                     !self.0[0],
@@ -124,6 +204,10 @@ impl core::ops::Add for u32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(u32x4_add(self.0, rhs.0), u32x4_add(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vaddq_u32(self.0, rhs.0), vaddq_u32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, wrapping_add, rhs))
             }
@@ -145,75 +229,13 @@ impl core::ops::BitAnd for u32x8 {
                 )
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_and(self.0, rhs.0), v128_and(self.1, rhs.1))
+            } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
+                unsafe {
+                    Self(vandq_u32(self.0, rhs.0), vandq_u32(self.1, rhs.1))
+                }
             } else {
                 Self(impl_x8_op!(self, bitand, rhs))
             }
         }
     }
 }
-
-impl core::ops::Shl<i32> for u32x8 {
-    type Output = Self;
-
-    fn shl(self, rhs: i32) -> Self::Output {
-        cfg_if::cfg_if! {
-            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
-                let shift: __m128i = cast([rhs as u64, 0]);
-                Self(unsafe { _mm256_sll_epi32(self.0, shift) })
-            } else if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
-                let shift = cast([rhs as u64, 0]);
-                Self(
-                    unsafe { _mm_sll_epi32(self.0, shift) },
-                    unsafe { _mm_sll_epi32(self.1, shift) },
-                )
-            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(u32x4_shl(self.0, rhs as _), u32x4_shl(self.1, rhs as _))
-            } else {
-                let u = rhs as u64;
-                Self([
-                    self.0[0] << u,
-                    self.0[1] << u,
-                    self.0[2] << u,
-                    self.0[3] << u,
-                    self.0[4] << u,
-                    self.0[5] << u,
-                    self.0[6] << u,
-                    self.0[7] << u,
-                ])
-            }
-        }
-    }
-}
-
-impl core::ops::Shr<i32> for u32x8 {
-    type Output = Self;
-
-    fn shr(self, rhs: i32) -> Self::Output {
-        cfg_if::cfg_if! {
-            if #[cfg(all(feature = "simd", target_feature = "avx2"))] {
-                let shift: __m128i = cast([rhs as u64, 0]);
-                Self(unsafe { _mm256_srl_epi32(self.0, shift) })
-            } else if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
-                let shift: __m128i = cast([rhs as u64, 0]);
-                Self(
-                    unsafe { _mm_srl_epi32(self.0, shift) },
-                    unsafe { _mm_srl_epi32(self.1, shift) },
-                )
-            } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
-                Self(u32x4_shr(self.0, rhs as _), u32x4_shr(self.1, rhs as _))
-            } else {
-                let u = rhs as u64;
-                Self([
-                    self.0[0] >> u,
-                    self.0[1] >> u,
-                    self.0[2] >> u,
-                    self.0[3] >> u,
-                    self.0[4] >> u,
-                    self.0[5] >> u,
-                    self.0[6] >> u,
-                    self.0[7] >> u,
-                ])
-            }
-        }
-    }
-}
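Not part of the patch above — a small, self-contained sketch for reviewers illustrating why the new scalar `pmax`/`pmin` fallbacks carry the "don't have the same semantics for NaN and 0" caveat. The helper bodies are copied from the diff; the `main` function and the test values are illustrative only.

```rust
/// Copied from the diff: pseudo-max, returns `a` unless `a < b`.
fn pmax(a: f32, b: f32) -> f32 {
    if a < b { b } else { a }
}

/// Copied from the diff: pseudo-min, returns `a` unless `b < a`.
fn pmin(a: f32, b: f32) -> f32 {
    if b < a { b } else { a }
}

fn main() {
    // `f32::max` ignores NaN and returns the other operand...
    assert_eq!(f32::max(f32::NAN, 1.0), 1.0);
    // ...while the pseudo variants keep the first operand,
    // because any comparison against NaN is false.
    assert!(pmax(f32::NAN, 1.0).is_nan());
    assert_eq!(pmax(1.0, f32::NAN), 1.0);

    // For signed zero the first operand wins as well, since
    // `-0.0 < 0.0` is false under IEEE 754 comparison rules.
    assert_eq!(pmax(-0.0f32, 0.0).to_bits(), (-0.0f32).to_bits());
    assert_eq!(pmin(0.0f32, -0.0).to_bits(), 0.0f32.to_bits());
}
```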