Diffstat (limited to 'library/stdarch')
54 files changed, 3475 insertions, 1876 deletions
diff --git a/library/stdarch/.github/workflows/main.yml b/library/stdarch/.github/workflows/main.yml index 2de81e5a2..014a9aca0 100644 --- a/library/stdarch/.github/workflows/main.yml +++ b/library/stdarch/.github/workflows/main.yml @@ -74,13 +74,15 @@ jobs: - aarch64-unknown-linux-gnu - riscv64gc-unknown-linux-gnu - powerpc64le-unknown-linux-gnu - - mips-unknown-linux-gnu - - mips64-unknown-linux-gnuabi64 - - mips64el-unknown-linux-gnuabi64 + # MIPS targets disabled since they are dropped to tier 3. + # See https://github.com/rust-lang/compiler-team/issues/648 + #- mips-unknown-linux-gnu + #- mips64-unknown-linux-gnuabi64 + #- mips64el-unknown-linux-gnuabi64 + #- mipsel-unknown-linux-musl - s390x-unknown-linux-gnu - wasm32-wasi - i586-unknown-linux-gnu - - mipsel-unknown-linux-musl - nvptx64-nvidia-cuda - thumbv6m-none-eabi - thumbv7m-none-eabi @@ -114,15 +116,20 @@ jobs: os: ubuntu-latest - target: armv7-unknown-linux-gnueabihf os: ubuntu-latest - - target: mips-unknown-linux-gnu - os: ubuntu-latest - norun: true - - target: mips64-unknown-linux-gnuabi64 - os: ubuntu-latest - norun: true - - target: mips64el-unknown-linux-gnuabi64 - os: ubuntu-latest - norun: true + # MIPS targets disabled since they are dropped to tier 3. + # See https://github.com/rust-lang/compiler-team/issues/648 + #- target: mips-unknown-linux-gnu + # os: ubuntu-latest + # norun: true + #- target: mips64-unknown-linux-gnuabi64 + # os: ubuntu-latest + # norun: true + #- target: mips64el-unknown-linux-gnuabi64 + # os: ubuntu-latest + # norun: true + #- target: mipsel-unknown-linux-musl + # os: ubuntu-latest + # norun: 1 - target: powerpc64le-unknown-linux-gnu os: ubuntu-latest disable_assert_instr: true @@ -143,9 +150,6 @@ jobs: os: windows-latest - target: i586-unknown-linux-gnu os: ubuntu-latest - - target: mipsel-unknown-linux-musl - os: ubuntu-latest - norun: 1 - target: nvptx64-nvidia-cuda os: ubuntu-latest - target: thumbv6m-none-eabi diff --git a/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile index b9b3c682e..7ea795cac 100644 --- a/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile +++ b/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile @@ -1,10 +1,9 @@ -FROM ubuntu:22.04 +FROM ubuntu:23.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc libc6-dev qemu-user ca-certificates \ - gcc-riscv64-linux-gnu libc6-dev-riscv64-cross \ - qemu-user + gcc-riscv64-linux-gnu libc6-dev-riscv64-cross ENV CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_LINKER=riscv64-linux-gnu-gcc \ - CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUNNER="qemu-riscv64 -L /usr/riscv64-linux-gnu" \ + CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUNNER="qemu-riscv64 -L /usr/riscv64-linux-gnu -cpu rv64,zk=true,zbb=true,zbc=true" \ OBJDUMP=riscv64-linux-gnu-objdump diff --git a/library/stdarch/ci/dox.sh b/library/stdarch/ci/dox.sh index 3e507b456..cc207cb35 100755 --- a/library/stdarch/ci/dox.sh +++ b/library/stdarch/ci/dox.sh @@ -45,6 +45,8 @@ dox arm armv7-unknown-linux-gnueabihf dox aarch64 aarch64-unknown-linux-gnu dox powerpc powerpc-unknown-linux-gnu dox powerpc64le powerpc64le-unknown-linux-gnu -dox mips mips-unknown-linux-gnu -dox mips64 mips64-unknown-linux-gnuabi64 +# MIPS targets disabled since they are dropped to tier 3. 
+# See https://github.com/rust-lang/compiler-team/issues/648 +#dox mips mips-unknown-linux-gnu +#dox mips64 mips64-unknown-linux-gnuabi64 dox wasm32 wasm32-unknown-unknown diff --git a/library/stdarch/ci/run-docker.sh b/library/stdarch/ci/run-docker.sh index 32209d96c..59170439c 100755 --- a/library/stdarch/ci/run-docker.sh +++ b/library/stdarch/ci/run-docker.sh @@ -5,6 +5,11 @@ set -ex +if [ $# -lt 1 ]; then + >&2 echo "Usage: $0 <TARGET>" + exit 1 +fi + run() { target=$(echo "${1}" | sed 's/-emulated//') echo "Building docker container for TARGET=${1}" diff --git a/library/stdarch/ci/run.sh b/library/stdarch/ci/run.sh index 1c8e219e6..7b2416fda 100755 --- a/library/stdarch/ci/run.sh +++ b/library/stdarch/ci/run.sh @@ -47,6 +47,7 @@ case ${TARGET} in # Some of our test dependencies use the deprecated `gcc` crates which # doesn't detect RISC-V compilers automatically, so do it manually here. riscv64*) + export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+zk,+zbb,+zbc" export TARGET_CC="riscv64-linux-gnu-gcc" ;; esac @@ -76,6 +77,11 @@ cargo_test() { # qemu has an erratic behavior on those tests powerpc64*) cmd="$cmd --skip test_vec_lde_u16 --skip test_vec_lde_u32 --skip test_vec_expte" + ;; + # Miscompilation: https://github.com/rust-lang/rust/issues/112460 + arm*) + cmd="$cmd --skip vld2q_dup_f32" + ;; esac if [ "$SKIP_TESTS" != "" ]; then diff --git a/library/stdarch/crates/core_arch/src/aarch64/armclang.rs b/library/stdarch/crates/core_arch/src/aarch64/armclang.rs deleted file mode 100644 index 9a608702a..000000000 --- a/library/stdarch/crates/core_arch/src/aarch64/armclang.rs +++ /dev/null @@ -1,23 +0,0 @@ -//! ARM compiler specific intrinsics -//! -//! # References -//! -//! - [ARM Compiler v 6.10 - armclang Reference Guide][arm_comp_ref] -//! -//! [arm_comp_ref]: https://developer.arm.com/docs/100067/0610 - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Inserts a breakpoint instruction. -/// -/// `VAL` is a compile-time constant integer in range `[0, 65535]`. -/// -/// The breakpoint instruction inserted is `BRK` on A64. -#[cfg_attr(test, assert_instr(brk, VAL = 0))] -#[inline(always)] -#[rustc_legacy_const_generics(0)] -pub unsafe fn __breakpoint<const VAL: i32>() { - static_assert_uimm_bits!(VAL, 16); - crate::arch::asm!("brk {}", const VAL); -} diff --git a/library/stdarch/crates/core_arch/src/aarch64/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/mod.rs index 0411fc106..c31989dd3 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/mod.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/mod.rs @@ -6,9 +6,6 @@ //! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf //! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics -mod v8; -pub use self::v8::*; - mod neon; pub use self::neon::*; @@ -23,19 +20,8 @@ pub use self::prefetch::*; pub use super::arm_shared::*; -mod armclang; - -pub use self::armclang::*; - #[cfg(test)] use stdarch_test::assert_instr; -/// Generates the trap instruction `BRK 1` -#[cfg_attr(test, assert_instr(brk))] -#[inline] -pub unsafe fn brk() -> ! 
{ - crate::intrinsics::abort() -} - #[cfg(test)] pub(crate) mod test_support; diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs index da7fdf8b1..20dec6d80 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs @@ -926,7 +926,7 @@ pub unsafe fn vcgtq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t { simd_gt(a, b) } -/// Compare unsigned highe +/// Compare unsigned greater than /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_u64) #[inline] @@ -937,7 +937,7 @@ pub unsafe fn vcgt_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { simd_gt(a, b) } -/// Compare unsigned highe +/// Compare unsigned greater than /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_u64) #[inline] @@ -8353,6 +8353,62 @@ pub unsafe fn vst4q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2x4_t) { vst4q_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a as _) } +/// Dot product index form with unsigned and signed integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_laneq_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(test, assert_instr(usdot, LANE = 3))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vusdot_laneq_s32<const LANE: i32>(a: int32x2_t, b: uint8x8_t, c: int8x16_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + let c: int32x4_t = transmute(c); + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vusdot_s32(a, b, transmute(c)) +} + +/// Dot product index form with unsigned and signed integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_laneq_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(test, assert_instr(usdot, LANE = 3))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vusdotq_laneq_s32<const LANE: i32>(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + let c: int32x4_t = transmute(c); + let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vusdotq_s32(a, b, transmute(c)) +} + +/// Dot product index form with signed and unsigned integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudot_laneq_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(test, assert_instr(sudot, LANE = 3))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsudot_laneq_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: uint8x16_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 2); + let c: uint32x4_t = transmute(c); + let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vusdot_s32(a, transmute(c), b) +} + +/// Dot product index form with signed and unsigned integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudotq_laneq_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(test, assert_instr(sudot, LANE = 3))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsudotq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: uint8x16_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 2); + let c: uint32x4_t = transmute(c); + let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as 
u32]); + vusdotq_s32(a, transmute(c), b) +} + /// Multiply /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_f64) @@ -10501,80 +10557,7 @@ pub unsafe fn vcmlaq_rot270_laneq_f32<const LANE: i32>(a: float32x4_t, b: float3 vcmlaq_rot270_f32(a, b, c) } -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_s32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(sdot))] -pub unsafe fn vdot_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v2i32.v8i8")] - fn vdot_s32_(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t; - } - vdot_s32_(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_s32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(sdot))] -pub unsafe fn vdotq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v4i32.v16i8")] - fn vdotq_s32_(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t; - } - vdotq_s32_(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_u32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(udot))] -pub unsafe fn vdot_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v2i32.v8i8")] - fn vdot_u32_(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t; - } - vdot_u32_(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_u32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(udot))] -pub unsafe fn vdotq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v4i32.v16i8")] - fn vdotq_u32_(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t; - } - vdotq_u32_(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_s32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(sdot, LANE = 0))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t { - static_assert_uimm_bits!(LANE, 1); - let c: int8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdot_s32(a, b, c) -} - -/// Dot product arithmetic +/// Dot product arithmetic (indexed) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_s32) #[inline] @@ -10583,24 +10566,12 @@ pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x #[rustc_legacy_const_generics(3)] pub unsafe fn vdot_laneq_s32<const 
LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x16_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 2); - let c: int8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdot_s32(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_s32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(sdot, LANE = 0))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x8_t) -> int32x4_t { - static_assert_uimm_bits!(LANE, 1); - let c: int8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdotq_s32(a, b, c) + let c: int32x4_t = transmute(c); + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vdot_s32(a, b, transmute(c)) } -/// Dot product arithmetic +/// Dot product arithmetic (indexed) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_s32) #[inline] @@ -10609,24 +10580,12 @@ pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int #[rustc_legacy_const_generics(3)] pub unsafe fn vdotq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - let c: int8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdotq_s32(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_u32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(udot, LANE = 0))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t { - static_assert_uimm_bits!(LANE, 1); - let c: uint8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdot_u32(a, b, c) + let c: int32x4_t = transmute(c); + let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vdotq_s32(a, b, transmute(c)) } -/// Dot product arithmetic +/// Dot product arithmetic (indexed) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_u32) #[inline] @@ -10635,24 +10594,12 @@ pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uin #[rustc_legacy_const_generics(3)] pub unsafe fn vdot_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x16_t) -> uint32x2_t { static_assert_uimm_bits!(LANE, 2); - let c: uint8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE 
as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdot_u32(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_u32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(udot, LANE = 0))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x8_t) -> uint32x4_t { - static_assert_uimm_bits!(LANE, 1); - let c: uint8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdotq_u32(a, b, c) + let c: uint32x4_t = transmute(c); + let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vdot_u32(a, b, transmute(c)) } -/// Dot product arithmetic +/// Dot product arithmetic (indexed) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_u32) #[inline] @@ -10661,8 +10608,9 @@ pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: u #[rustc_legacy_const_generics(3)] pub unsafe fn vdotq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 2); - let c: uint8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdotq_u32(a, b, c) + let c: uint32x4_t = transmute(c); + let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vdotq_u32(a, b, transmute(c)) } /// Maximum (vector) @@ -14864,7 +14812,7 @@ pub unsafe fn vrshrn_high_n_u64<const N: i32>(a: uint32x2_t, b: uint64x2_t) -> u /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsrad_n_s64) #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(srsra, N = 2))] +#[cfg_attr(test, assert_instr(srshr, N = 2))] #[rustc_legacy_const_generics(2)] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vrsrad_n_s64<const N: i32>(a: i64, b: i64) -> i64 { @@ -14873,12 +14821,12 @@ pub unsafe fn vrsrad_n_s64<const N: i32>(a: i64, b: i64) -> i64 { a.wrapping_add(b) } -/// Ungisned rounding shift right and accumulate. +/// Unsigned rounding shift right and accumulate. 
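The reworked lane-indexed dot products above keep the SDOT/UDOT-by-element behaviour: lane LANE of `c` selects one group of four bytes, and each 32-bit lane of the result accumulates a four-way byte product into `a`. A minimal scalar sketch of that behaviour (the model function and `main` are illustrative, not part of the crate), reproducing the updated `test_vdot_laneq_s32` expectations later in this diff:

// Hypothetical scalar model of vdot_laneq_s32:
//   out[i] = a[i] + sum over k in 0..4 of b[4*i + k] * c[4*LANE + k]
fn vdot_laneq_s32_model<const LANE: usize>(a: [i32; 2], b: [i8; 8], c: [i8; 16]) -> [i32; 2] {
    let mut out = a;
    for i in 0..2 {
        for k in 0..4 {
            out[i] += i32::from(b[4 * i + k]) * i32::from(c[4 * LANE + k]);
        }
    }
    out
}

fn main() {
    // Same inputs as the updated test: lane 0 of c is (1, 2, 3, 4).
    let a = [1, 2];
    let b = [-1, 2, 3, 4, 5, 6, 7, 8];
    let c = [1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
    assert_eq!(vdot_laneq_s32_model::<0>(a, b, c), [29, 72]);
}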
/// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrsrad_n_u64) #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ursra, N = 2))] +#[cfg_attr(test, assert_instr(urshr, N = 2))] #[rustc_legacy_const_generics(2)] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vrsrad_n_u64<const N: i32>(a: u64, b: u64) -> u64 { @@ -15349,6 +15297,36 @@ pub unsafe fn vrnd32xq_f32(a: float32x4_t) -> float32x4_t { vrnd32xq_f32_(a) } +/// Floating-point round to 32-bit integer, using current rounding mode +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32xq_f64) +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint32x))] +pub unsafe fn vrnd32xq_f64(a: float64x2_t) -> float64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32x.v2f64")] + fn vrnd32xq_f64_(a: float64x2_t) -> float64x2_t; + } + vrnd32xq_f64_(a) +} + +/// Floating-point round to 32-bit integer, using current rounding mode +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32x_f64) +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint32x))] +pub unsafe fn vrnd32x_f64(a: float64x1_t) -> float64x1_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.frint32x.f64")] + fn vrnd32x_f64_(a: f64) -> f64; + } + transmute(vrnd32x_f64_(simd_extract(a, 0))) +} + /// Floating-point round to 32-bit integer toward zero /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32z_f32) @@ -15379,6 +15357,36 @@ pub unsafe fn vrnd32zq_f32(a: float32x4_t) -> float32x4_t { vrnd32zq_f32_(a) } +/// Floating-point round to 32-bit integer toward zero +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32zq_f64) +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint32z))] +pub unsafe fn vrnd32zq_f64(a: float64x2_t) -> float64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32z.v2f64")] + fn vrnd32zq_f64_(a: float64x2_t) -> float64x2_t; + } + vrnd32zq_f64_(a) +} + +/// Floating-point round to 32-bit integer toward zero +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd32z_f64) +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint32z))] +pub unsafe fn vrnd32z_f64(a: float64x1_t) -> float64x1_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.frint32z.f64")] + fn vrnd32z_f64_(a: f64) -> f64; + } + transmute(vrnd32z_f64_(simd_extract(a, 0))) +} + /// Floating-point round to 64-bit integer, using current rounding mode /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64x_f32) @@ -15409,6 +15417,36 @@ pub unsafe fn vrnd64xq_f32(a: float32x4_t) -> float32x4_t { vrnd64xq_f32_(a) } +/// Floating-point round to 64-bit integer, using current rounding mode +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64xq_f64) +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, 
assert_instr(frint64x))] +pub unsafe fn vrnd64xq_f64(a: float64x2_t) -> float64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64x.v2f64")] + fn vrnd64xq_f64_(a: float64x2_t) -> float64x2_t; + } + vrnd64xq_f64_(a) +} + +/// Floating-point round to 64-bit integer, using current rounding mode +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64x_f64) +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint64x))] +pub unsafe fn vrnd64x_f64(a: float64x1_t) -> float64x1_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.frint64x.f64")] + fn vrnd64x_f64_(a: f64) -> f64; + } + transmute(vrnd64x_f64_(simd_extract(a, 0))) +} + /// Floating-point round to 64-bit integer toward zero /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64z_f32) @@ -15439,6 +15477,36 @@ pub unsafe fn vrnd64zq_f32(a: float32x4_t) -> float32x4_t { vrnd64zq_f32_(a) } +/// Floating-point round to 64-bit integer toward zero +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64zq_f64) +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint64z))] +pub unsafe fn vrnd64zq_f64(a: float64x2_t) -> float64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64z.v2f64")] + fn vrnd64zq_f64_(a: float64x2_t) -> float64x2_t; + } + vrnd64zq_f64_(a) +} + +/// Floating-point round to 64-bit integer toward zero +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrnd64z_f64) +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint64z))] +pub unsafe fn vrnd64z_f64(a: float64x1_t) -> float64x1_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.frint64z.f64")] + fn vrnd64z_f64_(a: f64) -> f64; + } + transmute(vrnd64z_f64_(simd_extract(a, 0))) +} + /// Transpose vectors /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtrn1_s8) @@ -22184,6 +22252,46 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusdot_laneq_s32() { + let a: i32x2 = i32x2::new(1000, -4200); + let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let c: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11); + let e: i32x2 = i32x2::new(-3420, -10140); + let r: i32x2 = transmute(vusdot_laneq_s32::<3>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusdotq_laneq_s32() { + let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000); + let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250); + let c: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11); + let e: i32x4 = i32x4::new(-3420, -10140, -8460, -6980); + let r: i32x4 = transmute(vusdotq_laneq_s32::<3>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vsudot_laneq_s32() { + let a: i32x2 = i32x2::new(-2000, 4200); + let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); + let c: u8x16 = 
u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250); + let e: i32x2 = i32x2::new(300, 2740); + let r: i32x2 = transmute(vsudot_laneq_s32::<3>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vsudotq_laneq_s32() { + let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000); + let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11); + let c: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250); + let e: i32x4 = i32x4::new(300, 2740, -6220, -6980); + let r: i32x4 = transmute(vsudotq_laneq_s32::<3>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmul_f64() { let a: f64 = 1.0; @@ -23664,121 +23772,41 @@ mod test { } #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdot_s32() { - let a: i32x2 = i32x2::new(1, 2); - let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i32x2 = i32x2::new(31, 176); - let r: i32x2 = transmute(vdot_s32(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdotq_s32() { - let a: i32x4 = i32x4::new(1, 2, 1, 2); - let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let e: i32x4 = i32x4::new(31, 176, 31, 176); - let r: i32x4 = transmute(vdotq_s32(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdot_u32() { - let a: u32x2 = u32x2::new(1, 2); - let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: u32x2 = u32x2::new(31, 176); - let r: u32x2 = transmute(vdot_u32(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdotq_u32() { - let a: u32x4 = u32x4::new(1, 2, 1, 2); - let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let e: u32x4 = u32x4::new(31, 176, 31, 176); - let r: u32x4 = transmute(vdotq_u32(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdot_lane_s32() { - let a: i32x2 = i32x2::new(1, 2); - let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i32x2 = i32x2::new(31, 72); - let r: i32x2 = transmute(vdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] unsafe fn test_vdot_laneq_s32() { let a: i32x2 = i32x2::new(1, 2); - let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: i8x8 = i8x8::new(-1, 2, 3, 4, 5, 6, 7, 8); let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let e: i32x2 = i32x2::new(31, 72); + let e: i32x2 = i32x2::new(29, 72); let r: i32x2 = transmute(vdot_laneq_s32::<0>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdotq_lane_s32() { - let a: i32x4 = i32x4::new(1, 2, 1, 2); - let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i32x4 = i32x4::new(31, 72, 31, 72); - let r: i32x4 = 
transmute(vdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] unsafe fn test_vdotq_laneq_s32() { let a: i32x4 = i32x4::new(1, 2, 1, 2); - let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let b: i8x16 = i8x16::new(-1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let e: i32x4 = i32x4::new(31, 72, 31, 72); + let e: i32x4 = i32x4::new(29, 72, 31, 72); let r: i32x4 = transmute(vdotq_laneq_s32::<0>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdot_lane_u32() { - let a: u32x2 = u32x2::new(1, 2); - let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: u32x2 = u32x2::new(31, 72); - let r: u32x2 = transmute(vdot_lane_u32::<0>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] unsafe fn test_vdot_laneq_u32() { let a: u32x2 = u32x2::new(1, 2); - let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: u8x8 = u8x8::new(255, 2, 3, 4, 5, 6, 7, 8); let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let e: u32x2 = u32x2::new(31, 72); + let e: u32x2 = u32x2::new(285, 72); let r: u32x2 = transmute(vdot_laneq_u32::<0>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdotq_lane_u32() { - let a: u32x4 = u32x4::new(1, 2, 1, 2); - let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: u32x4 = u32x4::new(31, 72, 31, 72); - let r: u32x4 = transmute(vdotq_lane_u32::<0>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] unsafe fn test_vdotq_laneq_u32() { let a: u32x4 = u32x4::new(1, 2, 1, 2); - let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let b: u8x16 = u8x16::new(255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let e: u32x4 = u32x4::new(31, 72, 31, 72); + let e: u32x4 = u32x4::new(285, 72, 31, 72); let r: u32x4 = transmute(vdotq_laneq_u32::<0>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } @@ -26888,68 +26916,332 @@ mod test { #[simd_test(enable = "neon,frintts")] unsafe fn test_vrnd32x_f32() { - let a: f32x2 = f32x2::new(1.1, 1.9); - let e: f32x2 = f32x2::new(1.0, 2.0); + let a: f32x2 = f32x2::new(-1.5, 2.9); + let e: f32x2 = f32x2::new(-2.0, 3.0); let r: f32x2 = transmute(vrnd32x_f32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon,frintts")] unsafe fn test_vrnd32xq_f32() { - let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3); - let e: f32x4 = f32x4::new(1.0, 2.0, -2.0, -2.0); + let a: f32x4 = f32x4::new(-1.5, 2.9, 1.5, -2.5); + let e: f32x4 = f32x4::new(-2.0, 3.0, 2.0, -2.0); let r: f32x4 = transmute(vrnd32xq_f32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd32xq_f64() { + let a: f64x2 = f64x2::new(-1.5, 2.9); + let e: f64x2 = f64x2::new(-2.0, 3.0); + let r: f64x2 = transmute(vrnd32xq_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64x2 = f64x2::new(1.5, -2.5); + let e: f64x2 = f64x2::new(2.0, -2.0); + let r: f64x2 = transmute(vrnd32xq_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64x2 = f64x2::new(2147483647.499999762, 
2147483647.5); + let e: f64x2 = f64x2::new(2147483647.0, -2147483648.0); + let r: f64x2 = transmute(vrnd32xq_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64x2 = f64x2::new(-2147483647.499999762, -2147483648.500000477); + let e: f64x2 = f64x2::new(-2147483647.0, -2147483648.0); + let r: f64x2 = transmute(vrnd32xq_f64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd32x_f64() { + let a: f64 = -1.5; + let e: f64 = -2.0; + let r: f64 = transmute(vrnd32x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 1.5; + let e: f64 = 2.0; + let r: f64 = transmute(vrnd32x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 2147483647.499999762; + let e: f64 = 2147483647.0; + let r: f64 = transmute(vrnd32x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = -2147483647.499999762; + let e: f64 = -2147483647.0; + let r: f64 = transmute(vrnd32x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 2.9; + let e: f64 = 3.0; + let r: f64 = transmute(vrnd32x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = -2.5; + let e: f64 = -2.0; + let r: f64 = transmute(vrnd32x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 2147483647.5; + let e: f64 = -2147483648.0; + let r: f64 = transmute(vrnd32x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = -2147483648.500000477; + let e: f64 = -2147483648.0; + let r: f64 = transmute(vrnd32x_f64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] unsafe fn test_vrnd32z_f32() { - let a: f32x2 = f32x2::new(1.1, 1.9); - let e: f32x2 = f32x2::new(1.0, 1.0); + let a: f32x2 = f32x2::new(-1.5, 2.9); + let e: f32x2 = f32x2::new(-1.0, 2.0); let r: f32x2 = transmute(vrnd32z_f32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon,frintts")] unsafe fn test_vrnd32zq_f32() { - let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3); - let e: f32x4 = f32x4::new(1.0, 1.0, -1.0, -2.0); + let a: f32x4 = f32x4::new(-1.5, 2.9, 1.5, -2.5); + let e: f32x4 = f32x4::new(-1.0, 2.0, 1.0, -2.0); let r: f32x4 = transmute(vrnd32zq_f32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd32zq_f64() { + let a: f64x2 = f64x2::new(-1.5, 2.9); + let e: f64x2 = f64x2::new(-1.0, 2.0); + let r: f64x2 = transmute(vrnd32zq_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64x2 = f64x2::new(1.5, -2.5); + let e: f64x2 = f64x2::new(1.0, -2.0); + let r: f64x2 = transmute(vrnd32zq_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64x2 = f64x2::new(2147483647.999999762, 2147483648.0); + let e: f64x2 = f64x2::new(2147483647.0, -2147483648.0); + let r: f64x2 = transmute(vrnd32zq_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64x2 = f64x2::new(-2147483647.999999762, -2147483649.0); + let e: f64x2 = f64x2::new(-2147483647.0, -2147483648.0); + let r: f64x2 = transmute(vrnd32zq_f64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd32z_f64() { + let a: f64 = -1.5; + let e: f64 = -1.0; + let r: f64 = transmute(vrnd32z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 1.5; + let e: f64 = 1.0; + let r: f64 = transmute(vrnd32z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 2147483647.999999762; + let e: f64 = 2147483647.0; + let r: f64 = transmute(vrnd32z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = -2147483647.999999762; + let e: f64 = -2147483647.0; + let r: f64 = transmute(vrnd32z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 2.9; + let e: f64 = 2.0; + let r: 
f64 = transmute(vrnd32z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = -2.5; + let e: f64 = -2.0; + let r: f64 = transmute(vrnd32z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 2147483648.0; + let e: f64 = -2147483648.0; + let r: f64 = transmute(vrnd32z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = -2147483649.0; + let e: f64 = -2147483648.0; + let r: f64 = transmute(vrnd32z_f64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] unsafe fn test_vrnd64x_f32() { - let a: f32x2 = f32x2::new(1.1, 1.9); - let e: f32x2 = f32x2::new(1.0, 2.0); + let a: f32x2 = f32x2::new(-1.5, 2.9); + let e: f32x2 = f32x2::new(-2.0, 3.0); let r: f32x2 = transmute(vrnd64x_f32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon,frintts")] unsafe fn test_vrnd64xq_f32() { - let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3); - let e: f32x4 = f32x4::new(1.0, 2.0, -2.0, -2.0); + let a: f32x4 = f32x4::new(-1.5, 2.9, 1.5, -2.5); + let e: f32x4 = f32x4::new(-2.0, 3.0, 2.0, -2.0); let r: f32x4 = transmute(vrnd64xq_f32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd64xq_f64() { + let a: f64x2 = f64x2::new(-1.5, 2.9); + let e: f64x2 = f64x2::new(-2.0, 3.0); + let r: f64x2 = transmute(vrnd64xq_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64x2 = f64x2::new(1.5, -2.5); + let e: f64x2 = f64x2::new(2.0, -2.0); + let r: f64x2 = transmute(vrnd64xq_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64x2 = f64x2::new(9223372036854774784.0, 9223372036854775808.0); + let e: f64x2 = f64x2::new(9223372036854774784.0, -9223372036854775808.0); + let r: f64x2 = transmute(vrnd64xq_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64x2 = f64x2::new(-9223372036854775808.0, -9223372036854777856.0); + let e: f64x2 = f64x2::new(-9223372036854775808.0, -9223372036854775808.0); + let r: f64x2 = transmute(vrnd64xq_f64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd64x_f64() { + let a: f64 = -1.5; + let e: f64 = -2.0; + let r: f64 = transmute(vrnd64x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 1.5; + let e: f64 = 2.0; + let r: f64 = transmute(vrnd64x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 9223372036854774784.0; + let e: f64 = 9223372036854774784.0; + let r: f64 = transmute(vrnd64x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = -9223372036854775808.0; + let e: f64 = -9223372036854775808.0; + let r: f64 = transmute(vrnd64x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 2.9; + let e: f64 = 3.0; + let r: f64 = transmute(vrnd64x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = -2.5; + let e: f64 = -2.0; + let r: f64 = transmute(vrnd64x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 9223372036854775808.0; + let e: f64 = -9223372036854775808.0; + let r: f64 = transmute(vrnd64x_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = -9223372036854777856.0; + let e: f64 = -9223372036854775808.0; + let r: f64 = transmute(vrnd64x_f64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] unsafe fn test_vrnd64z_f32() { - let a: f32x2 = f32x2::new(1.1, 1.9); - let e: f32x2 = f32x2::new(1.0, 1.0); + let a: f32x2 = f32x2::new(-1.5, 2.9); + let e: f32x2 = f32x2::new(-1.0, 2.0); let r: f32x2 = transmute(vrnd64z_f32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon,frintts")] unsafe fn test_vrnd64zq_f32() { - let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3); - let e: f32x4 = 
f32x4::new(1.0, 1.0, -1.0, -2.0); + let a: f32x4 = f32x4::new(-1.5, 2.9, 1.5, -2.5); + let e: f32x4 = f32x4::new(-1.0, 2.0, 1.0, -2.0); let r: f32x4 = transmute(vrnd64zq_f32(transmute(a))); assert_eq!(r, e); } + #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd64zq_f64() { + let a: f64x2 = f64x2::new(-1.5, 2.9); + let e: f64x2 = f64x2::new(-1.0, 2.0); + let r: f64x2 = transmute(vrnd64zq_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64x2 = f64x2::new(1.5, -2.5); + let e: f64x2 = f64x2::new(1.0, -2.0); + let r: f64x2 = transmute(vrnd64zq_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64x2 = f64x2::new(9223372036854774784.0, 9223372036854775808.0); + let e: f64x2 = f64x2::new(9223372036854774784.0, -9223372036854775808.0); + let r: f64x2 = transmute(vrnd64zq_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64x2 = f64x2::new(-9223372036854775808.0, -9223372036854777856.0); + let e: f64x2 = f64x2::new(-9223372036854775808.0, -9223372036854775808.0); + let r: f64x2 = transmute(vrnd64zq_f64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd64z_f64() { + let a: f64 = -1.5; + let e: f64 = -1.0; + let r: f64 = transmute(vrnd64z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 1.5; + let e: f64 = 1.0; + let r: f64 = transmute(vrnd64z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 9223372036854774784.0; + let e: f64 = 9223372036854774784.0; + let r: f64 = transmute(vrnd64z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = -9223372036854775808.0; + let e: f64 = -9223372036854775808.0; + let r: f64 = transmute(vrnd64z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 2.9; + let e: f64 = 2.0; + let r: f64 = transmute(vrnd64z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = -2.5; + let e: f64 = -2.0; + let r: f64 = transmute(vrnd64z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = 9223372036854775808.0; + let e: f64 = -9223372036854775808.0; + let r: f64 = transmute(vrnd64z_f64(transmute(a))); + assert_eq!(r, e); + + let a: f64 = -9223372036854777856.0; + let e: f64 = -9223372036854775808.0; + let r: f64 = transmute(vrnd64z_f64(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vtrn1_s8() { let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14); diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs index 850657033..30fa21dd8 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs @@ -4127,11 +4127,11 @@ mod tests { #[simd_test(enable = "neon")] unsafe fn test_vpminq_s8() { - #[cfg_attr(rustfmt, skip)] + #[rustfmt::skip] let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - #[cfg_attr(rustfmt, skip)] + #[rustfmt::skip] let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9); - #[cfg_attr(rustfmt, skip)] + #[rustfmt::skip] let e = i8x16::new(-2, -4, 5, 7, 1, 3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6); let r: i8x16 = transmute(vpminq_s8(transmute(a), transmute(b))); assert_eq!(r, e); @@ -4157,11 +4157,11 @@ mod tests { #[simd_test(enable = "neon")] unsafe fn test_vpminq_u8() { - #[cfg_attr(rustfmt, skip)] + #[rustfmt::skip] let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - #[cfg_attr(rustfmt, skip)] + #[rustfmt::skip] let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9); - #[cfg_attr(rustfmt, skip)] + #[rustfmt::skip] let e = u8x16::new(1, 3, 5, 7, 1, 
3, 5, 7, 0, 2, 4, 6, 0, 2, 4, 6); let r: u8x16 = transmute(vpminq_u8(transmute(a), transmute(b))); assert_eq!(r, e); @@ -4205,11 +4205,11 @@ mod tests { #[simd_test(enable = "neon")] unsafe fn test_vpmaxq_s8() { - #[cfg_attr(rustfmt, skip)] + #[rustfmt::skip] let a = i8x16::new(1, -2, 3, -4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - #[cfg_attr(rustfmt, skip)] + #[rustfmt::skip] let b = i8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9); - #[cfg_attr(rustfmt, skip)] + #[rustfmt::skip] let e = i8x16::new(1, 3, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9); let r: i8x16 = transmute(vpmaxq_s8(transmute(a), transmute(b))); assert_eq!(r, e); @@ -4235,11 +4235,11 @@ mod tests { #[simd_test(enable = "neon")] unsafe fn test_vpmaxq_u8() { - #[cfg_attr(rustfmt, skip)] + #[rustfmt::skip] let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - #[cfg_attr(rustfmt, skip)] + #[rustfmt::skip] let b = u8x16::new(0, 3, 2, 5, 4, 7, 6, 9, 0, 3, 2, 5, 4, 7, 6, 9); - #[cfg_attr(rustfmt, skip)] + #[rustfmt::skip] let e = u8x16::new(2, 4, 6, 8, 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9); let r: u8x16 = transmute(vpmaxq_u8(transmute(a), transmute(b))); assert_eq!(r, e); diff --git a/library/stdarch/crates/core_arch/src/aarch64/tme.rs b/library/stdarch/crates/core_arch/src/aarch64/tme.rs index 05df313e4..15f1b877d 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/tme.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/tme.rs @@ -21,9 +21,9 @@ extern "unadjusted" { #[link_name = "llvm.aarch64.tstart"] fn aarch64_tstart() -> u64; #[link_name = "llvm.aarch64.tcommit"] - fn aarch64_tcommit() -> (); + fn aarch64_tcommit(); #[link_name = "llvm.aarch64.tcancel"] - fn aarch64_tcancel(imm0: u64) -> (); + fn aarch64_tcancel(imm0: u64); #[link_name = "llvm.aarch64.ttest"] fn aarch64_ttest() -> u64; } diff --git a/library/stdarch/crates/core_arch/src/aarch64/v8.rs b/library/stdarch/crates/core_arch/src/aarch64/v8.rs deleted file mode 100644 index 778721c68..000000000 --- a/library/stdarch/crates/core_arch/src/aarch64/v8.rs +++ /dev/null @@ -1,104 +0,0 @@ -//! ARMv8 intrinsics. -//! -//! The reference is [ARMv8-A Reference Manual][armv8]. -//! -//! [armv8]: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc. -//! ddi0487a.k_10775/index.html - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Reverse the order of the bytes. -#[inline] -#[cfg_attr(test, assert_instr(rev))] -pub unsafe fn _rev_u64(x: u64) -> u64 { - x.swap_bytes() as u64 -} - -/// Count Leading Zeros. -#[inline] -#[cfg_attr(test, assert_instr(clz))] -pub unsafe fn _clz_u64(x: u64) -> u64 { - x.leading_zeros() as u64 -} - -/// Reverse the bit order. -#[inline] -#[cfg_attr(test, assert_instr(rbit))] -pub unsafe fn _rbit_u64(x: u64) -> u64 { - crate::intrinsics::bitreverse(x) -} - -/// Counts the leading most significant bits set. -/// -/// When all bits of the operand are set it returns the size of the operand in -/// bits. -#[inline] -#[cfg_attr(test, assert_instr(cls))] -pub unsafe fn _cls_u32(x: u32) -> u32 { - u32::leading_zeros((((((x as i32) >> 31) as u32) ^ x) << 1) | 1) as u32 -} - -/// Counts the leading most significant bits set. -/// -/// When all bits of the operand are set it returns the size of the operand in -/// bits. 
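The `v8.rs` module being deleted here only wrapped bit operations that stable `core` already provides on `u64`. A rough equivalence sketch (the free-function names are illustrative; the `_*_u64` names in the comments are the removed helpers):

// Sketch: stable core equivalents of the deleted aarch64 v8.rs helpers.
fn rev_u64(x: u64) -> u64 { x.swap_bytes() }                 // was _rev_u64 (REV)
fn clz_u64(x: u64) -> u64 { u64::from(x.leading_zeros()) }   // was _clz_u64 (CLZ)
fn rbit_u64(x: u64) -> u64 { x.reverse_bits() }              // was _rbit_u64 (RBIT)
// was _cls_u64 (CLS): leading bits equal to the sign bit, excluding the sign bit itself
fn cls_u64(x: u64) -> u64 {
    let sign_mask = ((x as i64) >> 63) as u64; // all-ones when the top bit is set
    u64::from((((sign_mask ^ x) << 1) | 1).leading_zeros())
}

fn main() {
    assert_eq!(clz_u64(0b0000_1010), 60);
    assert_eq!(rbit_u64(1), 1u64 << 63);
    assert_eq!(cls_u64(u64::MAX), 63);
}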
-#[inline] -#[cfg_attr(test, assert_instr(cls))] -pub unsafe fn _cls_u64(x: u64) -> u64 { - u64::leading_zeros((((((x as i64) >> 63) as u64) ^ x) << 1) | 1) as u64 -} - -#[cfg(test)] -mod tests { - use crate::core_arch::aarch64::v8; - - #[test] - fn _rev_u64() { - unsafe { - assert_eq!( - v8::_rev_u64(0b0000_0000_1111_1111_0000_0000_1111_1111_u64), - 0b1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64 - ); - } - } - - #[test] - fn _clz_u64() { - unsafe { - assert_eq!(v8::_clz_u64(0b0000_1010u64), 60u64); - } - } - - #[test] - fn _rbit_u64() { - unsafe { - assert_eq!( - v8::_rbit_u64(0b0000_0000_1111_1101_0000_0000_1111_1111_u64), - 0b1111_1111_0000_0000_1011_1111_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_u64 - ); - } - } - - #[test] - fn _cls_u32() { - unsafe { - assert_eq!( - v8::_cls_u32(0b1111_1111_1111_1111_0000_0000_1111_1111_u32), - 15_u32 - ); - } - } - - #[test] - fn _cls_u64() { - unsafe { - assert_eq!( - v8::_cls_u64( - 0b1111_1111_1111_1111_0000_0000_1111_1111_0000_0000_0000_0000_0000_0000_0000_0000_u64 - ), - 15_u64 - ); - } - } -} diff --git a/library/stdarch/crates/core_arch/src/arm/armclang.rs b/library/stdarch/crates/core_arch/src/arm/armclang.rs deleted file mode 100644 index e44ee2f4a..000000000 --- a/library/stdarch/crates/core_arch/src/arm/armclang.rs +++ /dev/null @@ -1,35 +0,0 @@ -//! ARM compiler specific intrinsics -//! -//! # References -//! -//! - [ARM Compiler v 6.10 - armclang Reference Guide][arm_comp_ref] -//! -//! [arm_comp_ref]: https://developer.arm.com/docs/100067/0610 - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Inserts a breakpoint instruction. -/// -/// `VAL` is a compile-time constant integer in range `[0, 255]`. -/// -/// The breakpoint instruction inserted is `BKPT` on A32/T32. -/// -/// # Note -/// -/// [ARM's documentation][arm_docs] defines that `__breakpoint` accepts the -/// following values for `VAL`: -/// -/// - `0...65535` when compiling as A32, -/// - `0...255` when compiling as T32. -/// -/// The current implementation only accepts values in range `[0, 255]`. -/// -/// [arm_docs]: https://developer.arm.com/docs/100067/latest/compiler-specific-intrinsics/__breakpoint-intrinsic -#[cfg_attr(test, assert_instr(bkpt, VAL = 0))] -#[inline(always)] -#[rustc_legacy_const_generics(0)] -pub unsafe fn __breakpoint<const VAL: i32>() { - static_assert_uimm_bits!(VAL, 8); - crate::arch::asm!("bkpt #{}", const VAL); -} diff --git a/library/stdarch/crates/core_arch/src/arm/ex.rs b/library/stdarch/crates/core_arch/src/arm/ex.rs deleted file mode 100644 index 75f378642..000000000 --- a/library/stdarch/crates/core_arch/src/arm/ex.rs +++ /dev/null @@ -1,125 +0,0 @@ -// Reference: Section 5.4.4 "LDREX / STREX" of ACLE - -/// Removes the exclusive lock created by LDREX -// Supported: v6, v6K, v7-M, v7-A, v7-R -// Not supported: v5, v6-M -// NOTE: there's no dedicated CLREX instruction in v6 (<v6k); to clear the exclusive monitor users -// have to do a dummy STREX operation -#[cfg(any( - all(target_feature = "v6k", not(target_feature = "mclass")), // excludes v6-M - all(target_feature = "v7", target_feature = "mclass"), // v7-M - doc -))] -pub unsafe fn __clrex() { - extern "unadjusted" { - #[link_name = "llvm.arm.clrex"] - fn clrex(); - } - - clrex() -} - -/// Executes an exclusive LDR instruction for 8 bit value. 
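Likewise, the exclusive-access wrappers in `ex.rs` deleted here cover ground that `core::sync::atomic` handles portably: atomic read-modify-write operations typically lower to LDREX/STREX retry loops on the Armv6K+/Armv7 targets these wrappers were gated to. A hedged sketch of that retry shape (the function is illustrative, not an API of this crate):

use core::sync::atomic::{AtomicU32, Ordering};

// Sketch: add `n` atomically, retrying on contention -- the same shape as a
// hand-written LDREX / STREX loop. Returns the previous value.
fn add_retry(cell: &AtomicU32, n: u32) -> u32 {
    let mut cur = cell.load(Ordering::Relaxed);
    loop {
        let new = cur.wrapping_add(n);
        match cell.compare_exchange_weak(cur, new, Ordering::AcqRel, Ordering::Relaxed) {
            Ok(prev) => return prev,
            Err(observed) => cur = observed,
        }
    }
}

fn main() {
    let x = AtomicU32::new(5);
    assert_eq!(add_retry(&x, 3), 5);
    assert_eq!(x.load(Ordering::Relaxed), 8);
}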
-// Supported: v6K, v7-M, v7-A, v7-R -// Not supported: v5, v6, v6-M -#[cfg(any( - target_feature = "v6k", // includes v7-M but excludes v6-M - doc -))] -pub unsafe fn __ldrexb(p: *const u8) -> u8 { - extern "unadjusted" { - #[link_name = "llvm.arm.ldrex.p0i8"] - fn ldrex8(p: *const u8) -> u32; - } - - ldrex8(p) as u8 -} - -/// Executes an exclusive LDR instruction for 16 bit value. -// Supported: v6K, v7-M, v7-A, v7-R, v8 -// Not supported: v5, v6, v6-M -#[cfg(any( - target_feature = "v6k", // includes v7-M but excludes v6-M - doc -))] -pub unsafe fn __ldrexh(p: *const u16) -> u16 { - extern "unadjusted" { - #[link_name = "llvm.arm.ldrex.p0i16"] - fn ldrex16(p: *const u16) -> u32; - } - - ldrex16(p) as u16 -} - -/// Executes an exclusive LDR instruction for 32 bit value. -// Supported: v6, v7-M, v6K, v7-A, v7-R, v8 -// Not supported: v5, v6-M -#[cfg(any( - all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M - all(target_feature = "v7", target_feature = "mclass"), // v7-M - doc -))] -pub unsafe fn __ldrex(p: *const u32) -> u32 { - extern "unadjusted" { - #[link_name = "llvm.arm.ldrex.p0i32"] - fn ldrex32(p: *const u32) -> u32; - } - - ldrex32(p) -} - -/// Executes an exclusive STR instruction for 8 bit values -/// -/// Returns `0` if the operation succeeded, or `1` if it failed -// supported: v6K, v7-M, v7-A, v7-R -// Not supported: v5, v6, v6-M -#[cfg(any( - target_feature = "v6k", // includes v7-M but excludes v6-M - doc -))] -pub unsafe fn __strexb(value: u32, addr: *mut u8) -> u32 { - extern "unadjusted" { - #[link_name = "llvm.arm.strex.p0i8"] - fn strex8(value: u32, addr: *mut u8) -> u32; - } - - strex8(value, addr) -} - -/// Executes an exclusive STR instruction for 16 bit values -/// -/// Returns `0` if the operation succeeded, or `1` if it failed -// Supported: v6K, v7-M, v7-A, v7-R, v8 -// Not supported: v5, v6, v6-M -#[cfg(target_feature = "aarch64")] -#[cfg(any( - target_feature = "v6k", // includes v7-M but excludes v6-M - doc -))] -pub unsafe fn __strexh(value: u16, addr: *mut u16) -> u32 { - extern "unadjusted" { - #[link_name = "llvm.arm.strex.p0i16"] - fn strex16(value: u32, addr: *mut u16) -> u32; - } - - strex16(value as u32, addr) -} - -/// Executes an exclusive STR instruction for 32 bit values -/// -/// Returns `0` if the operation succeeded, or `1` if it failed -// Supported: v6, v7-M, v6K, v7-A, v7-R, v8 -// Not supported: v5, v6-M -#[cfg(any( - all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M - all(target_feature = "v7", target_feature = "mclass"), // v7-M - doc -))] -pub unsafe fn __strex(value: u32, addr: *mut u32) -> u32 { - extern "unadjusted" { - #[link_name = "llvm.arm.strex.p0i32"] - fn strex32(value: u32, addr: *mut u32) -> u32; - } - - strex32(value, addr) -} diff --git a/library/stdarch/crates/core_arch/src/arm/mod.rs b/library/stdarch/crates/core_arch/src/arm/mod.rs index ec91e5de5..9cc75a3cc 100644 --- a/library/stdarch/crates/core_arch/src/arm/mod.rs +++ b/library/stdarch/crates/core_arch/src/arm/mod.rs @@ -6,12 +6,6 @@ //! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf //! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics -mod armclang; -pub use self::armclang::*; - -mod v6; -pub use self::v6::*; - // Supported arches: 6, 7-M. See Section 10.1 of ACLE (e.g. 
SSAT) #[cfg(any(target_feature = "v6", doc))] mod sat; @@ -62,14 +56,6 @@ mod simd32; ))] pub use self::simd32::*; -#[cfg(any(target_feature = "v7", doc))] -mod v7; -#[cfg(any(target_feature = "v7", doc))] -pub use self::v7::*; - -mod ex; -pub use self::ex::*; - pub use crate::core_arch::arm_shared::*; #[cfg(test)] diff --git a/library/stdarch/crates/core_arch/src/arm/neon.rs b/library/stdarch/crates/core_arch/src/arm/neon.rs index e1de48538..75d3f19e8 100644 --- a/library/stdarch/crates/core_arch/src/arm/neon.rs +++ b/library/stdarch/crates/core_arch/src/arm/neon.rs @@ -1,16 +1,9 @@ use crate::core_arch::arm_shared::neon::*; -use crate::core_arch::simd::{f32x4, i32x4, u32x4}; -use crate::core_arch::simd_llvm::*; use crate::mem::{align_of, transmute}; #[cfg(test)] use stdarch_test::assert_instr; -#[allow(non_camel_case_types)] -pub(crate) type p8 = u8; -#[allow(non_camel_case_types)] -pub(crate) type p16 = u16; - #[allow(improper_ctypes)] extern "unadjusted" { #[link_name = "llvm.arm.neon.vbsl.v8i8"] @@ -794,27 +787,6 @@ pub unsafe fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> poly8x8_t )) } -// These float-to-int implementations have undefined behaviour when `a` overflows -// the destination type. Clang has the same problem: https://llvm.org/PR47510 - -/// Floating-point Convert to Signed fixed-point, rounding toward Zero (vector) -#[inline] -#[target_feature(enable = "neon")] -#[target_feature(enable = "v7")] -#[cfg_attr(test, assert_instr("vcvt.s32.f32"))] -pub unsafe fn vcvtq_s32_f32(a: float32x4_t) -> int32x4_t { - transmute(simd_cast::<_, i32x4>(transmute::<_, f32x4>(a))) -} - -/// Floating-point Convert to Unsigned fixed-point, rounding toward Zero (vector) -#[inline] -#[target_feature(enable = "neon")] -#[target_feature(enable = "v7")] -#[cfg_attr(test, assert_instr("vcvt.u32.f32"))] -pub unsafe fn vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t { - transmute(simd_cast::<_, u32x4>(transmute::<_, f32x4>(a))) -} - /// Shift Left and Insert (immediate) #[inline] #[target_feature(enable = "neon,v7")] diff --git a/library/stdarch/crates/core_arch/src/arm/v6.rs b/library/stdarch/crates/core_arch/src/arm/v6.rs deleted file mode 100644 index 5df30cd62..000000000 --- a/library/stdarch/crates/core_arch/src/arm/v6.rs +++ /dev/null @@ -1,49 +0,0 @@ -//! ARMv6 intrinsics. -//! -//! The reference is [ARMv6-M Architecture Reference Manual][armv6m]. -//! -//! [armv6m]: -//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index. -//! html - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Reverse the order of the bytes. -#[inline] -#[cfg_attr(test, assert_instr(rev))] -pub unsafe fn _rev_u16(x: u16) -> u16 { - x.swap_bytes() as u16 -} - -/// Reverse the order of the bytes. -#[inline] -#[cfg_attr(test, assert_instr(rev))] -pub unsafe fn _rev_u32(x: u32) -> u32 { - x.swap_bytes() as u32 -} - -#[cfg(test)] -mod tests { - use crate::core_arch::arm::v6; - - #[test] - fn _rev_u16() { - unsafe { - assert_eq!( - v6::_rev_u16(0b0000_0000_1111_1111_u16), - 0b1111_1111_0000_0000_u16 - ); - } - } - - #[test] - fn _rev_u32() { - unsafe { - assert_eq!( - v6::_rev_u32(0b0000_0000_1111_1111_0000_0000_1111_1111_u32), - 0b1111_1111_0000_0000_1111_1111_0000_0000_u32 - ); - } - } -} diff --git a/library/stdarch/crates/core_arch/src/arm/v7.rs b/library/stdarch/crates/core_arch/src/arm/v7.rs deleted file mode 100644 index 59beaf722..000000000 --- a/library/stdarch/crates/core_arch/src/arm/v7.rs +++ /dev/null @@ -1,87 +0,0 @@ -//! ARMv7 intrinsics. -//! -//! 
The reference is [ARMv7-M Architecture Reference Manual (Issue -//! E.b)][armv7m]. -//! -//! [armv7m]: -//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e. -//! b/index.html - -pub use super::v6::*; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Count Leading Zeros. -#[inline] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))] -// FIXME: https://github.com/rust-lang/stdarch/issues/382 -// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))] -pub unsafe fn _clz_u8(x: u8) -> u8 { - x.leading_zeros() as u8 -} - -/// Count Leading Zeros. -#[inline] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))] -// FIXME: https://github.com/rust-lang/stdarch/issues/382 -// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))] -pub unsafe fn _clz_u16(x: u16) -> u16 { - x.leading_zeros() as u16 -} - -/// Count Leading Zeros. -#[inline] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))] -// FIXME: https://github.com/rust-lang/stdarch/issues/382 -// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))] -pub unsafe fn _clz_u32(x: u32) -> u32 { - x.leading_zeros() as u32 -} - -/// Reverse the bit order. -#[inline] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(test, assert_instr(rbit))] -pub unsafe fn _rbit_u32(x: u32) -> u32 { - crate::intrinsics::bitreverse(x) -} - -#[cfg(test)] -mod tests { - use crate::core_arch::arm::v7; - - #[test] - fn _clz_u8() { - unsafe { - assert_eq!(v7::_clz_u8(0b0000_1010u8), 4u8); - } - } - - #[test] - fn _clz_u16() { - unsafe { - assert_eq!(v7::_clz_u16(0b0000_1010u16), 12u16); - } - } - - #[test] - fn _clz_u32() { - unsafe { - assert_eq!(v7::_clz_u32(0b0000_1010u32), 28u32); - } - } - - #[test] - fn _rbit_u32() { - unsafe { - assert_eq!( - v7::_rbit_u32(0b0000_1010u32), - 0b0101_0000_0000_0000_0000_0000_0000_0000u32 - ); - } - } -} diff --git a/library/stdarch/crates/core_arch/src/arm_shared/mod.rs b/library/stdarch/crates/core_arch/src/arm_shared/mod.rs index 4c8d19854..fc6617f5a 100644 --- a/library/stdarch/crates/core_arch/src/arm_shared/mod.rs +++ b/library/stdarch/crates/core_arch/src/arm_shared/mod.rs @@ -59,9 +59,6 @@ pub use self::barrier::*; mod hints; pub use self::hints::*; -mod registers; -pub use self::registers::*; - #[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))] mod crc; #[cfg(any(target_arch = "aarch64", target_feature = "v7", doc))] diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs index 775811e65..34dc3a334 100644 --- a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs @@ -1532,7 +1532,7 @@ pub unsafe fn vcgtq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { simd_gt(a, b) } -/// Compare unsigned highe +/// Compare unsigned greater than /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_u8) #[inline] @@ -1545,7 +1545,7 @@ pub unsafe fn vcgt_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { simd_gt(a, b) } -/// Compare unsigned highe +/// Compare unsigned greater than /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_u8) #[inline] @@ 
-1558,7 +1558,7 @@ pub unsafe fn vcgtq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_gt(a, b) } -/// Compare unsigned highe +/// Compare unsigned greater than /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_u16) #[inline] @@ -1571,7 +1571,7 @@ pub unsafe fn vcgt_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { simd_gt(a, b) } -/// Compare unsigned highe +/// Compare unsigned greater than /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_u16) #[inline] @@ -1584,7 +1584,7 @@ pub unsafe fn vcgtq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { simd_gt(a, b) } -/// Compare unsigned highe +/// Compare unsigned greater than /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgt_u32) #[inline] @@ -1597,7 +1597,7 @@ pub unsafe fn vcgt_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { simd_gt(a, b) } -/// Compare unsigned highe +/// Compare unsigned greater than /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcgtq_u32) #[inline] @@ -2888,7 +2888,7 @@ vcvt_n_f32_s32_(a, N) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_f32_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(scvtf, N = 2))] #[rustc_legacy_const_generics(1)] @@ -2925,7 +2925,7 @@ vcvtq_n_f32_s32_(a, N) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_f32_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(scvtf, N = 2))] #[rustc_legacy_const_generics(1)] @@ -2962,7 +2962,7 @@ vcvt_n_f32_u32_(a, N) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_f32_u32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ucvtf, N = 2))] #[rustc_legacy_const_generics(1)] @@ -2999,7 +2999,7 @@ vcvtq_n_f32_u32_(a, N) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_f32_u32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ucvtf, N = 2))] #[rustc_legacy_const_generics(1)] @@ -3036,7 +3036,7 @@ vcvt_n_s32_f32_(a, N) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_s32_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fcvtzs, N = 2))] #[rustc_legacy_const_generics(1)] @@ -3073,7 +3073,7 @@ vcvtq_n_s32_f32_(a, N) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_s32_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fcvtzs, N = 2))] #[rustc_legacy_const_generics(1)] @@ -3110,7 +3110,7 @@ vcvt_n_u32_f32_(a, N) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvt_n_u32_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, 
assert_instr(fcvtzu, N = 2))] #[rustc_legacy_const_generics(1)] @@ -3147,7 +3147,7 @@ vcvtq_n_u32_f32_(a, N) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vcvtq_n_u32_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fcvtzu, N = 2))] #[rustc_legacy_const_generics(1)] @@ -8548,7 +8548,7 @@ vld2_s8_(a as *const i8, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -8581,7 +8581,7 @@ vld2_s16_(a as *const i8, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -8614,7 +8614,7 @@ vld2_s32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -8647,7 +8647,7 @@ vld2q_s8_(a as *const i8, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -8680,7 +8680,7 @@ vld2q_s16_(a as *const i8, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -8713,7 +8713,7 @@ vld2q_s32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -8746,7 +8746,7 @@ vld2_s64_(a as *const i8, 8) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(nop))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -8935,7 +8935,7 @@ vld2_f32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -8968,7 +8968,7 @@ vld2q_f32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_f32) #[inline] -#[cfg(target_arch = "aarch64")] 
+#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9001,7 +9001,7 @@ vld2_dup_s8_(a as *const i8, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9034,7 +9034,7 @@ vld2_dup_s16_(a as *const i8, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9067,7 +9067,7 @@ vld2_dup_s32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9100,7 +9100,7 @@ vld2q_dup_s8_(a as *const i8, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9133,7 +9133,7 @@ vld2q_dup_s16_(a as *const i8, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9166,7 +9166,7 @@ vld2q_dup_s32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9199,7 +9199,7 @@ vld2_dup_s64_(a as *const i8, 8) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9388,7 +9388,7 @@ vld2_dup_f32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_dup_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9421,7 +9421,7 @@ vld2q_dup_f32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_dup_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9456,7 +9456,7 @@ 
vld2_lane_s8_(a as _, b.0, b.1, LANE, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -9493,7 +9493,7 @@ vld2_lane_s16_(a as _, b.0, b.1, LANE, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -9530,7 +9530,7 @@ vld2_lane_s32_(a as _, b.0, b.1, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -9567,7 +9567,7 @@ vld2q_lane_s16_(a as _, b.0, b.1, LANE, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -9604,7 +9604,7 @@ vld2q_lane_s32_(a as _, b.0, b.1, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -9761,7 +9761,7 @@ vld2_lane_f32_(a as _, b.0, b.1, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -9798,7 +9798,7 @@ vld2q_lane_f32_(a as _, b.0, b.1, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2q_lane_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -9833,7 +9833,7 @@ vld3_s8_(a as *const i8, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9866,7 +9866,7 @@ vld3_s16_(a as *const i8, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9899,7 +9899,7 @@ vld3_s32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] 
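The vld2 changes above only widen the cfg gate; the signatures stay exactly as shown in the hunk. As a rough usage sketch (not part of the patch, and assuming `vld2q_f32`/`vst2q_f32` are exposed through `core::arch::aarch64` as the `#[stable(...)]` attributes suggest), the pair deinterleaves and re-interleaves two streams:

```rust
// Sketch only: deinterleave [x0, y0, x1, y1, ...] with vld2q_f32 and
// write the 8 f32 values back interleaved with vst2q_f32. Gated to
// aarch64 here, since that is one of the paths this hunk now covers
// with cfg(not(target_arch = "arm")).
#[cfg(target_arch = "aarch64")]
unsafe fn round_trip(src: *const f32, dst: *mut f32) {
    use core::arch::aarch64::{vld2q_f32, vst2q_f32};
    let pair = vld2q_f32(src); // pair.0 = even elements, pair.1 = odd elements
    vst2q_f32(dst, pair);      // writes the same 8 values back, re-interleaved
}
```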
#[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9932,7 +9932,7 @@ vld3q_s8_(a as *const i8, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9965,7 +9965,7 @@ vld3q_s16_(a as *const i8, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -9998,7 +9998,7 @@ vld3q_s32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -10031,7 +10031,7 @@ vld3_s64_(a as *const i8, 8) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(nop))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -10220,7 +10220,7 @@ vld3_f32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -10253,7 +10253,7 @@ vld3q_f32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -10286,7 +10286,7 @@ vld3_dup_s8_(a as *const i8, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -10319,7 +10319,7 @@ vld3_dup_s16_(a as *const i8, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -10352,7 +10352,7 @@ vld3_dup_s32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -10385,7 +10385,7 @@ vld3q_dup_s8_(a as *const i8, 1) /// /// [Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -10418,7 +10418,7 @@ vld3q_dup_s16_(a as *const i8, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -10451,7 +10451,7 @@ vld3q_dup_s32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -10484,7 +10484,7 @@ vld3_dup_s64_(a as *const i8, 8) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -10673,7 +10673,7 @@ vld3_dup_f32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_dup_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -10706,7 +10706,7 @@ vld3q_dup_f32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_dup_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -10741,7 +10741,7 @@ vld3_lane_s8_(a as _, b.0, b.1, b.2, LANE, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -10778,7 +10778,7 @@ vld3_lane_s16_(a as _, b.0, b.1, b.2, LANE, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -10815,7 +10815,7 @@ vld3_lane_s32_(a as _, b.0, b.1, b.2, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -10852,7 +10852,7 @@ vld3q_lane_s16_(a as _, b.0, b.1, b.2, LANE, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_s16) #[inline] -#[cfg(target_arch = "aarch64")] 
+#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -10889,7 +10889,7 @@ vld3q_lane_s32_(a as _, b.0, b.1, b.2, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -11046,7 +11046,7 @@ vld3_lane_f32_(a as _, b.0, b.1, b.2, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -11083,7 +11083,7 @@ vld3q_lane_f32_(a as _, b.0, b.1, b.2, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_lane_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -11118,7 +11118,7 @@ vld4_s8_(a as *const i8, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11151,7 +11151,7 @@ vld4_s16_(a as *const i8, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11184,7 +11184,7 @@ vld4_s32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11217,7 +11217,7 @@ vld4q_s8_(a as *const i8, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11250,7 +11250,7 @@ vld4q_s16_(a as *const i8, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11283,7 +11283,7 @@ vld4q_s32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11316,7 +11316,7 @@ vld4_s64_(a as *const i8, 8) /// /// 
[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(nop))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11505,7 +11505,7 @@ vld4_f32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11538,7 +11538,7 @@ vld4q_f32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11571,7 +11571,7 @@ vld4_dup_s8_(a as *const i8, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11604,7 +11604,7 @@ vld4_dup_s16_(a as *const i8, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11637,7 +11637,7 @@ vld4_dup_s32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11670,7 +11670,7 @@ vld4q_dup_s8_(a as *const i8, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11703,7 +11703,7 @@ vld4q_dup_s16_(a as *const i8, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11736,7 +11736,7 @@ vld4q_dup_s32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11769,7 +11769,7 @@ vld4_dup_s64_(a as *const i8, 8) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] 
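Every change in this stretch is the same mechanical edit: these structured load intrinsics were previously compiled only when `target_arch = "aarch64"`, and now compile for every target that is not 32-bit `arm`. A minimal sketch of what that predicate flip means for cfg resolution (illustrative only, no stdarch internals assumed):

```rust
// cfg(not(target_arch = "arm")) accepts aarch64 and any other
// non-32-bit-ARM target, while the old cfg(target_arch = "aarch64")
// accepted aarch64 alone; 32-bit ARM keeps its separate definitions.
#[cfg(target_arch = "arm")]
fn which_definition() -> &'static str {
    "32-bit ARM: separate arm-specific definitions"
}

#[cfg(not(target_arch = "arm"))]
fn which_definition() -> &'static str {
    "any non-arm target: the definitions this hunk re-gates"
}

fn main() {
    println!("{}", which_definition());
}
```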
#[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11958,7 +11958,7 @@ vld4_dup_f32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_dup_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -11991,7 +11991,7 @@ vld4q_dup_f32_(a as *const i8, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_dup_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4r))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -12026,7 +12026,7 @@ vld4_lane_s8_(a as _, b.0, b.1, b.2, b.3, LANE, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -12063,7 +12063,7 @@ vld4_lane_s16_(a as _, b.0, b.1, b.2, b.3, LANE, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -12100,7 +12100,7 @@ vld4_lane_s32_(a as _, b.0, b.1, b.2, b.3, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -12137,7 +12137,7 @@ vld4q_lane_s16_(a as _, b.0, b.1, b.2, b.3, LANE, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -12174,7 +12174,7 @@ vld4q_lane_s32_(a as _, b.0, b.1, b.2, b.3, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -12331,7 +12331,7 @@ vld4_lane_f32_(a as _, b.0, b.1, b.2, b.3, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -12368,7 +12368,7 @@ vld4q_lane_f32_(a as _, b.0, b.1, b.2, b.3, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_lane_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(ld4, LANE = 0))] 
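The `_dup` variants re-gated just above map to `ld4r` and broadcast each of the four loaded elements across one output register. A hedged sketch of calling the `vld4_dup_f32` shown in this hunk, again assuming the aarch64 re-export:

```rust
// ld4r semantics: reads src[0..4] once and splats src[i] across every
// lane of the i-th output vector. With src = [1.0, 2.0, 3.0, 4.0]:
// out.0 = {1.0, 1.0}, out.1 = {2.0, 2.0}, out.2 = {3.0, 3.0}, out.3 = {4.0, 4.0}.
#[cfg(target_arch = "aarch64")]
unsafe fn splat4(src: *const f32) -> core::arch::aarch64::float32x2x4_t {
    core::arch::aarch64::vld4_dup_f32(src)
}
```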
#[rustc_legacy_const_generics(2)] @@ -12763,7 +12763,7 @@ vst1_s8_x2_(a, b.0, b.1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s8_x2) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -12796,7 +12796,7 @@ vst1_s16_x2_(a, b.0, b.1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s16_x2) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -12829,7 +12829,7 @@ vst1_s32_x2_(a, b.0, b.1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s32_x2) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -12862,7 +12862,7 @@ vst1_s64_x2_(a, b.0, b.1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s64_x2) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -12895,7 +12895,7 @@ vst1q_s8_x2_(a, b.0, b.1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s8_x2) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -12928,7 +12928,7 @@ vst1q_s16_x2_(a, b.0, b.1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s16_x2) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -12961,7 +12961,7 @@ vst1q_s32_x2_(a, b.0, b.1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s32_x2) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -12994,7 +12994,7 @@ vst1q_s64_x2_(a, b.0, b.1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s64_x2) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13027,7 +13027,7 @@ vst1_s8_x3_(a, b.0, b.1, b.2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s8_x3) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13060,7 +13060,7 @@ vst1_s16_x3_(a, b.0, b.1, b.2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s16_x3) #[inline] -#[cfg(target_arch = "aarch64")] 
+#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13093,7 +13093,7 @@ vst1_s32_x3_(a, b.0, b.1, b.2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s32_x3) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13126,7 +13126,7 @@ vst1_s64_x3_(a, b.0, b.1, b.2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s64_x3) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13159,7 +13159,7 @@ vst1q_s8_x3_(a, b.0, b.1, b.2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s8_x3) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13192,7 +13192,7 @@ vst1q_s16_x3_(a, b.0, b.1, b.2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s16_x3) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13225,7 +13225,7 @@ vst1q_s32_x3_(a, b.0, b.1, b.2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s32_x3) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13258,7 +13258,7 @@ vst1q_s64_x3_(a, b.0, b.1, b.2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s64_x3) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13291,7 +13291,7 @@ vst1_s8_x4_(a, b.0, b.1, b.2, b.3) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s8_x4) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13324,7 +13324,7 @@ vst1_s16_x4_(a, b.0, b.1, b.2, b.3) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s16_x4) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13357,7 +13357,7 @@ vst1_s32_x4_(a, b.0, b.1, b.2, b.3) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s32_x4) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13390,7 +13390,7 @@ 
vst1_s64_x4_(a, b.0, b.1, b.2, b.3) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_s64_x4) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13423,7 +13423,7 @@ vst1q_s8_x4_(a, b.0, b.1, b.2, b.3) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s8_x4) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13456,7 +13456,7 @@ vst1q_s16_x4_(a, b.0, b.1, b.2, b.3) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s16_x4) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13489,7 +13489,7 @@ vst1q_s32_x4_(a, b.0, b.1, b.2, b.3) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s32_x4) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -13522,7 +13522,7 @@ vst1q_s64_x4_(a, b.0, b.1, b.2, b.3) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_s64_x4) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14101,7 +14101,7 @@ vst1_f32_x2_(a, b.0, b.1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f32_x2) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14134,7 +14134,7 @@ vst1q_f32_x2_(a, b.0, b.1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f32_x2) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14167,7 +14167,7 @@ vst1_f32_x3_(a, b.0, b.1, b.2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f32_x3) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14200,7 +14200,7 @@ vst1q_f32_x3_(a, b.0, b.1, b.2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f32_x3) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14233,7 +14233,7 @@ vst1_f32_x4_(a, b.0, b.1, b.2, b.3) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1_f32_x4) #[inline] -#[cfg(target_arch = 
"aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14266,7 +14266,7 @@ vst1q_f32_x4_(a, b.0, b.1, b.2, b.3) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst1q_f32_x4) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st1))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14299,7 +14299,7 @@ vst2_s8_(a as _, b.0, b.1, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14332,7 +14332,7 @@ vst2_s16_(a as _, b.0, b.1, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14365,7 +14365,7 @@ vst2_s32_(a as _, b.0, b.1, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14398,7 +14398,7 @@ vst2q_s8_(a as _, b.0, b.1, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14431,7 +14431,7 @@ vst2q_s16_(a as _, b.0, b.1, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14464,7 +14464,7 @@ vst2q_s32_(a as _, b.0, b.1, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14497,7 +14497,7 @@ vst2_s64_(a as _, b.0, b.1, 8) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(nop))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14686,7 +14686,7 @@ vst2_f32_(a as _, b.0, b.1, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14719,7 +14719,7 @@ vst2q_f32_(a as _, 
b.0, b.1, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -14754,7 +14754,7 @@ vst2_lane_s8_(a as _, b.0, b.1, LANE, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -14791,7 +14791,7 @@ vst2_lane_s16_(a as _, b.0, b.1, LANE, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -14828,7 +14828,7 @@ vst2_lane_s32_(a as _, b.0, b.1, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -14865,7 +14865,7 @@ vst2q_lane_s16_(a as _, b.0, b.1, LANE, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -14902,7 +14902,7 @@ vst2q_lane_s32_(a as _, b.0, b.1, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -15059,7 +15059,7 @@ vst2_lane_f32_(a as _, b.0, b.1, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2_lane_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -15096,7 +15096,7 @@ vst2q_lane_f32_(a as _, b.0, b.1, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst2q_lane_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st2, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -15131,7 +15131,7 @@ vst3_s8_(a as _, b.0, b.1, b.2, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -15164,7 +15164,7 @@ vst3_s16_(a as _, b.0, b.1, b.2, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] 
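The `_lane` stores in this stretch take the lane index as a const generic (moved out of the argument list by `#[rustc_legacy_const_generics(2)]`), so the index is checked at compile time. A small sketch with `vst2q_lane_f32` from the hunk, assuming the aarch64 path:

```rust
// Writes lane 1 of each of the two registers in `v` as two consecutive
// f32 values at `dst`. LANE must be 0..=3 for a q-register of f32s;
// an out-of-range value fails to compile via the static assert.
#[cfg(target_arch = "aarch64")]
unsafe fn store_lane_1(dst: *mut f32, v: core::arch::aarch64::float32x4x2_t) {
    core::arch::aarch64::vst2q_lane_f32::<1>(dst, v);
}
```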
#[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -15197,7 +15197,7 @@ vst3_s32_(a as _, b.0, b.1, b.2, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -15230,7 +15230,7 @@ vst3q_s8_(a as _, b.0, b.1, b.2, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -15263,7 +15263,7 @@ vst3q_s16_(a as _, b.0, b.1, b.2, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -15296,7 +15296,7 @@ vst3q_s32_(a as _, b.0, b.1, b.2, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -15329,7 +15329,7 @@ vst3_s64_(a as _, b.0, b.1, b.2, 8) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(nop))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -15518,7 +15518,7 @@ vst3_f32_(a as _, b.0, b.1, b.2, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -15551,7 +15551,7 @@ vst3q_f32_(a as _, b.0, b.1, b.2, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -15586,7 +15586,7 @@ vst3_lane_s8_(a as _, b.0, b.1, b.2, LANE, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -15623,7 +15623,7 @@ vst3_lane_s16_(a as _, b.0, b.1, b.2, LANE, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -15660,7 +15660,7 @@ vst3_lane_s32_(a as _, 
b.0, b.1, b.2, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -15697,7 +15697,7 @@ vst3q_lane_s16_(a as _, b.0, b.1, b.2, LANE, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -15734,7 +15734,7 @@ vst3q_lane_s32_(a as _, b.0, b.1, b.2, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -15891,7 +15891,7 @@ vst3_lane_f32_(a as _, b.0, b.1, b.2, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3_lane_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -15928,7 +15928,7 @@ vst3q_lane_f32_(a as _, b.0, b.1, b.2, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst3q_lane_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st3, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -15963,7 +15963,7 @@ vst4_s8_(a as _, b.0, b.1, b.2, b.3, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -15996,7 +15996,7 @@ vst4_s16_(a as _, b.0, b.1, b.2, b.3, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -16029,7 +16029,7 @@ vst4_s32_(a as _, b.0, b.1, b.2, b.3, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -16062,7 +16062,7 @@ vst4q_s8_(a as _, b.0, b.1, b.2, b.3, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -16095,7 +16095,7 @@ vst4q_s16_(a as _, b.0, b.1, b.2, b.3, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s16) #[inline] 
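As with the loads, the structured stores only change their cfg gate here. For orientation, a hedged sketch of the classic use of `vst3q_f32` (seen a little earlier in this hunk): packing three planar channels into interleaved RGB. `vld1q_f32` is assumed from `core::arch::aarch64` and is not part of this diff.

```rust
// Interleaves four RGB triples: dst receives r0 g0 b0 r1 g1 b1 ... (12 f32s).
// Sketch only; pointer validity and alignment are the caller's responsibility.
#[cfg(target_arch = "aarch64")]
unsafe fn pack_rgb(r: *const f32, g: *const f32, b: *const f32, dst: *mut f32) {
    use core::arch::aarch64::{float32x4x3_t, vld1q_f32, vst3q_f32};
    let rgb = float32x4x3_t(vld1q_f32(r), vld1q_f32(g), vld1q_f32(b));
    vst3q_f32(dst, rgb);
}
```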
-#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -16128,7 +16128,7 @@ vst4q_s32_(a as _, b.0, b.1, b.2, b.3, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -16161,7 +16161,7 @@ vst4_s64_(a as _, b.0, b.1, b.2, b.3, 8) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(nop))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -16350,7 +16350,7 @@ vst4_f32_(a as _, b.0, b.1, b.2, b.3, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -16383,7 +16383,7 @@ vst4q_f32_(a as _, b.0, b.1, b.2, b.3, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] @@ -16418,7 +16418,7 @@ vst4_lane_s8_(a as _, b.0, b.1, b.2, b.3, LANE, 1) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -16455,7 +16455,7 @@ vst4_lane_s16_(a as _, b.0, b.1, b.2, b.3, LANE, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -16492,7 +16492,7 @@ vst4_lane_s32_(a as _, b.0, b.1, b.2, b.3, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -16529,7 +16529,7 @@ vst4q_lane_s16_(a as _, b.0, b.1, b.2, b.3, LANE, 2) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -16566,7 +16566,7 @@ vst4q_lane_s32_(a as _, b.0, b.1, b.2, b.3, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] 
#[cfg_attr(test, assert_instr(st4, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -16723,7 +16723,7 @@ vst4_lane_f32_(a as _, b.0, b.1, b.2, b.3, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4_lane_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -16760,7 +16760,7 @@ vst4q_lane_f32_(a as _, b.0, b.1, b.2, b.3, LANE, 4) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vst4q_lane_f32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(st4, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -16775,6 +16775,106 @@ pub unsafe fn vst4q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x4_t) { vst4q_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a as _) } +/// Dot product vector form with unsigned and signed integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot))] +pub unsafe fn vusdot_s32(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usdot.v2i32.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.usdot.v2i32.v8i8")] + fn vusdot_s32_(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t; + } +vusdot_s32_(a, b, c) +} + +/// Dot product vector form with unsigned and signed integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot))] +pub unsafe fn vusdotq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usdot.v4i32.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.usdot.v4i32.v16i8")] + fn vusdotq_s32_(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t; + } +vusdotq_s32_(a, b, c) +} + +/// Dot product index form with unsigned and signed integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_lane_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vusdot_lane_s32<const LANE: i32>(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + let c: int32x2_t = transmute(c); + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vusdot_s32(a, b, transmute(c)) +} + +/// Dot product index form with unsigned and signed integers +/// +/// [Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_lane_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vusdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: uint8x16_t, c: int8x8_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + let c: int32x2_t = transmute(c); + let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vusdotq_s32(a, b, transmute(c)) +} + +/// Dot product index form with signed and unsigned integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudot_lane_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsudot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sudot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsudot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: uint8x8_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + let c: uint32x2_t = transmute(c); + let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vusdot_s32(a, transmute(c), b) +} + +/// Dot product index form with signed and unsigned integers +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudotq_lane_s32) +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsudot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sudot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsudotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: uint8x8_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + let c: uint32x2_t = transmute(c); + let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vusdotq_s32(a, transmute(c), b) +} + /// Multiply /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_s8) @@ -18737,6 +18837,142 @@ pub unsafe fn vsubl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { simd_sub(c, d) } +/// Dot product arithmetic (vector) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_s32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot))] +pub unsafe fn vdot_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sdot.v2i32.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v2i32.v8i8")] + fn vdot_s32_(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t; + } +vdot_s32_(a, b, c) +} + +/// Dot product arithmetic (vector) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_s32) +#[inline] +#[target_feature(enable = "neon,dotprod")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot))] +pub unsafe fn vdotq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sdot.v4i32.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v4i32.v16i8")] + fn vdotq_s32_(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t; + } +vdotq_s32_(a, b, c) +} + +/// Dot product arithmetic (vector) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_u32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot))] +pub unsafe fn vdot_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.udot.v2i32.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v2i32.v8i8")] + fn vdot_u32_(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t; + } +vdot_u32_(a, b, c) +} + +/// Dot product arithmetic (vector) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_u32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot))] +pub unsafe fn vdotq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.udot.v4i32.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v4i32.v16i8")] + fn vdotq_u32_(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t; + } +vdotq_u32_(a, b, c) +} + +/// Dot product arithmetic (indexed) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_s32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + let c: int32x2_t = transmute(c); + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vdot_s32(a, b, transmute(c)) +} + +/// Dot product arithmetic (indexed) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_s32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x8_t) -> int32x4_t { + 
static_assert_uimm_bits!(LANE, 1); + let c: int32x2_t = transmute(c); + let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vdotq_s32(a, b, transmute(c)) +} + +/// Dot product arithmetic (indexed) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_u32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t { + static_assert_uimm_bits!(LANE, 1); + let c: uint32x2_t = transmute(c); + let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vdot_u32(a, b, transmute(c)) +} + +/// Dot product arithmetic (indexed) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_u32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x8_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 1); + let c: uint32x2_t = transmute(c); + let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vdotq_u32(a, b, transmute(c)) +} + /// Maximum (vector) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmax_s8) @@ -20569,7 +20805,7 @@ vqrshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i1 /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqrshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -20606,7 +20842,7 @@ vqrshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqrshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -20643,7 +20879,7 @@ vqrshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqrshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -20680,7 +20916,7 @@ vqrshrn_n_u16_(a, uint16x8_t(-N as u16, -N as u16, -N as u16, -N as u16, -N as u /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_u16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uqrshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -20717,7 +20953,7 @@ vqrshrn_n_u32_(a, uint32x4_t(-N as u32, -N as u32, -N as u32, 
-N as u32)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_u32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uqrshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -20754,7 +20990,7 @@ vqrshrn_n_u64_(a, uint64x2_t(-N as u64, -N as u64)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrn_n_u64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uqrshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -20791,7 +21027,7 @@ vqrshrun_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_n_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqrshrun, N = 2))] #[rustc_legacy_const_generics(1)] @@ -20828,7 +21064,7 @@ vqrshrun_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_n_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqrshrun, N = 2))] #[rustc_legacy_const_generics(1)] @@ -20865,7 +21101,7 @@ vqrshrun_n_s64_(a, int64x2_t(-N as i64, -N as i64)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqrshrun_n_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqrshrun, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21446,7 +21682,7 @@ vqshlu_n_s8_(a, int8x8_t(N as i8, N as i8, N as i8, N as i8, N as i8, N as i8, N /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlu_n_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshlu, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21483,7 +21719,7 @@ vqshlu_n_s16_(a, int16x4_t(N as i16, N as i16, N as i16, N as i16)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlu_n_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshlu, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21520,7 +21756,7 @@ vqshlu_n_s32_(a, int32x2_t(N as i32, N as i32)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlu_n_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshlu, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21557,7 +21793,7 @@ vqshlu_n_s64_(a, int64x1_t(N as i64)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshlu_n_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshlu, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21594,7 +21830,7 @@ vqshluq_n_s8_(a, int8x16_t(N as i8, N as i8, N as i8, N as i8, N as i8, 
N as i8, /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluq_n_s8) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshlu, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21631,7 +21867,7 @@ vqshluq_n_s16_(a, int16x8_t(N as i16, N as i16, N as i16, N as i16, N as i16, N /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluq_n_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshlu, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21668,7 +21904,7 @@ vqshluq_n_s32_(a, int32x4_t(N as i32, N as i32, N as i32, N as i32)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluq_n_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshlu, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21705,7 +21941,7 @@ vqshluq_n_s64_(a, int64x2_t(N as i64, N as i64)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshluq_n_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshlu, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21742,7 +21978,7 @@ vqshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16 /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21779,7 +22015,7 @@ vqshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21816,7 +22052,7 @@ vqshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21853,7 +22089,7 @@ vqshrn_n_u16_(a, uint16x8_t(-N as u16, -N as u16, -N as u16, -N as u16, -N as u1 /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_u16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uqshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21890,7 +22126,7 @@ vqshrn_n_u32_(a, uint32x4_t(-N as u32, -N as u32, -N as u32, -N as u32)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_u32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uqshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21927,7 +22163,7 @@ vqshrn_n_u64_(a, 
uint64x2_t(-N as u64, -N as u64)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrn_n_u64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uqshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -21964,7 +22200,7 @@ vqshrun_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i1 /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_n_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshrun, N = 2))] #[rustc_legacy_const_generics(1)] @@ -22001,7 +22237,7 @@ vqshrun_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_n_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshrun, N = 2))] #[rustc_legacy_const_generics(1)] @@ -22038,7 +22274,7 @@ vqshrun_n_s64_(a, int64x2_t(-N as i64, -N as i64)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vqshrun_n_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(sqshrun, N = 2))] #[rustc_legacy_const_generics(1)] @@ -26185,7 +26421,7 @@ vrshlq_u64_(a, b) #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshr_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t { static_assert!(N >= 1 && N <= 8); - vrshl_s8(a, vdup_n_s8((-N) as _)) + vrshl_s8(a, vdup_n_s8(-N as _)) } /// Signed rounding shift right @@ -26200,7 +26436,7 @@ pub unsafe fn vrshr_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshrq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t { static_assert!(N >= 1 && N <= 8); - vrshlq_s8(a, vdupq_n_s8((-N) as _)) + vrshlq_s8(a, vdupq_n_s8(-N as _)) } /// Signed rounding shift right @@ -26215,7 +26451,7 @@ pub unsafe fn vrshrq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshr_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t { static_assert!(N >= 1 && N <= 16); - vrshl_s16(a, vdup_n_s16((-N) as _)) + vrshl_s16(a, vdup_n_s16(-N as _)) } /// Signed rounding shift right @@ -26230,7 +26466,7 @@ pub unsafe fn vrshr_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshrq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t { static_assert!(N >= 1 && N <= 16); - vrshlq_s16(a, vdupq_n_s16((-N) as _)) + vrshlq_s16(a, vdupq_n_s16(-N as _)) } /// Signed rounding shift right @@ -26245,7 +26481,7 @@ pub unsafe fn vrshrq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshr_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t { static_assert!(N >= 1 && N <= 32); - vrshl_s32(a, vdup_n_s32((-N) as _)) + vrshl_s32(a, vdup_n_s32(-N as _)) } /// Signed rounding shift right @@ -26260,7 +26496,7 @@ pub unsafe fn vrshr_n_s32<const N: i32>(a: int32x2_t) -> 
int32x2_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshrq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t { static_assert!(N >= 1 && N <= 32); - vrshlq_s32(a, vdupq_n_s32((-N) as _)) + vrshlq_s32(a, vdupq_n_s32(-N as _)) } /// Signed rounding shift right @@ -26275,7 +26511,7 @@ pub unsafe fn vrshrq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshr_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t { static_assert!(N >= 1 && N <= 64); - vrshl_s64(a, vdup_n_s64((-N) as _)) + vrshl_s64(a, vdup_n_s64(-N as _)) } /// Signed rounding shift right @@ -26290,7 +26526,7 @@ pub unsafe fn vrshr_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshrq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t { static_assert!(N >= 1 && N <= 64); - vrshlq_s64(a, vdupq_n_s64((-N) as _)) + vrshlq_s64(a, vdupq_n_s64(-N as _)) } /// Unsigned rounding shift right @@ -26305,7 +26541,7 @@ pub unsafe fn vrshrq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshr_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t { static_assert!(N >= 1 && N <= 8); - vrshl_u8(a, vdup_n_s8((-N) as _)) + vrshl_u8(a, vdup_n_s8(-N as _)) } /// Unsigned rounding shift right @@ -26320,7 +26556,7 @@ pub unsafe fn vrshr_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshrq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t { static_assert!(N >= 1 && N <= 8); - vrshlq_u8(a, vdupq_n_s8((-N) as _)) + vrshlq_u8(a, vdupq_n_s8(-N as _)) } /// Unsigned rounding shift right @@ -26335,7 +26571,7 @@ pub unsafe fn vrshrq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshr_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t { static_assert!(N >= 1 && N <= 16); - vrshl_u16(a, vdup_n_s16((-N) as _)) + vrshl_u16(a, vdup_n_s16(-N as _)) } /// Unsigned rounding shift right @@ -26350,7 +26586,7 @@ pub unsafe fn vrshr_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshrq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t { static_assert!(N >= 1 && N <= 16); - vrshlq_u16(a, vdupq_n_s16((-N) as _)) + vrshlq_u16(a, vdupq_n_s16(-N as _)) } /// Unsigned rounding shift right @@ -26365,7 +26601,7 @@ pub unsafe fn vrshrq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshr_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t { static_assert!(N >= 1 && N <= 32); - vrshl_u32(a, vdup_n_s32((-N) as _)) + vrshl_u32(a, vdup_n_s32(-N as _)) } /// Unsigned rounding shift right @@ -26380,7 +26616,7 @@ pub unsafe fn vrshr_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshrq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t { static_assert!(N >= 1 && N <= 32); - vrshlq_u32(a, vdupq_n_s32((-N) as _)) + vrshlq_u32(a, vdupq_n_s32(-N as _)) } /// Unsigned rounding shift right @@ 
-26395,7 +26631,7 @@ pub unsafe fn vrshrq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshr_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t { static_assert!(N >= 1 && N <= 64); - vrshl_u64(a, vdup_n_s64((-N) as _)) + vrshl_u64(a, vdup_n_s64(-N as _)) } /// Unsigned rounding shift right @@ -26410,7 +26646,7 @@ pub unsafe fn vrshr_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t { #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vrshrq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t { static_assert!(N >= 1 && N <= 64); - vrshlq_u64(a, vdupq_n_s64((-N) as _)) + vrshlq_u64(a, vdupq_n_s64(-N as _)) } /// Rounding shift right narrow @@ -26435,7 +26671,7 @@ vrshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16 /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_n_s16) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(rshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -26472,7 +26708,7 @@ vrshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_n_s32) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(rshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -26509,7 +26745,7 @@ vrshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64)) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vrshrn_n_s64) #[inline] -#[cfg(target_arch = "aarch64")] +#[cfg(not(target_arch = "arm"))] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(rshrn, N = 2))] #[rustc_legacy_const_generics(1)] @@ -28882,7 +29118,7 @@ pub unsafe fn vzip_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2x2_t { #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))] #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vzipq_s8(a: int8x16_t, b: int8x16_t) -> int8x16x2_t { let a0: int8x16_t = simd_shuffle!(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]); @@ -28897,7 +29133,7 @@ pub unsafe fn vzipq_s8(a: int8x16_t, b: int8x16_t) -> int8x16x2_t { #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))] #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vzipq_s16(a: int16x8_t, b: int16x8_t) -> int16x8x2_t { let a0: int16x8_t = simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]); @@ -28912,7 +29148,7 @@ pub unsafe fn vzipq_s16(a: int16x8_t, b: int16x8_t) -> int16x8x2_t { #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] -#[cfg_attr(all(test, 
target_arch = "aarch64"), assert_instr(ext))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))] #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vzipq_s32(a: int32x4_t, b: int32x4_t) -> int32x4x2_t { let a0: int32x4_t = simd_shuffle!(a, b, [0, 4, 1, 5]); @@ -28927,7 +29163,7 @@ pub unsafe fn vzipq_s32(a: int32x4_t, b: int32x4_t) -> int32x4x2_t { #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))] #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vzipq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16x2_t { let a0: uint8x16_t = simd_shuffle!(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]); @@ -28942,7 +29178,7 @@ pub unsafe fn vzipq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16x2_t { #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))] #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vzipq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8x2_t { let a0: uint16x8_t = simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]); @@ -28957,7 +29193,7 @@ pub unsafe fn vzipq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8x2_t { #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))] #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vzipq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4x2_t { let a0: uint32x4_t = simd_shuffle!(a, b, [0, 4, 1, 5]); @@ -28972,7 +29208,7 @@ pub unsafe fn vzipq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4x2_t { #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))] #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vzipq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16x2_t { let a0: poly8x16_t = simd_shuffle!(a, b, [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]); @@ -28987,7 +29223,7 @@ pub unsafe fn vzipq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16x2_t { #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))] #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vzipq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8x2_t { let a0: poly16x8_t = simd_shuffle!(a, b, [0, 8, 1, 9, 2, 10, 3, 11]); @@ -29017,7 +29253,7 @@ pub unsafe 
fn vzip_f32(a: float32x2_t, b: float32x2_t) -> float32x2x2_t { #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vorr))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ext))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(zip))] #[cfg_attr(not(target_arch = "arm"), stable(feature = "neon_intrinsics", since = "1.59.0"))] pub unsafe fn vzipq_f32(a: float32x4_t, b: float32x4_t) -> float32x4x2_t { let a0: float32x4_t = simd_shuffle!(a, b, [0, 4, 1, 5]); @@ -37823,6 +38059,94 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusdot_s32() { + let a: i32x2 = i32x2::new(1000, -4200); + let b: u8x8 = u8x8::new(100, 205, 110, 195, 120, 185, 130, 175); + let c: i8x8 = i8x8::new(0, 1, 2, 3, -1, -2, -3, -4); + let e: i32x2 = i32x2::new(2010, -5780); + let r: i32x2 = transmute(vusdot_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusdotq_s32() { + let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000); + let b: u8x16 = u8x16::new(100, 205, 110, 195, 120, 185, 130, 175, 140, 165, 150, 155, 160, 145, 170, 135); + let c: i8x16 = i8x16::new(0, 1, 2, 3, -1, -2, -3, -4, 4, 5, 6, 7, -5, -6, -7, -8); + let e: i32x4 = i32x4::new(2010, -5780, 2370, -1940); + let r: i32x4 = transmute(vusdotq_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusdot_lane_s32() { + let a: i32x2 = i32x2::new(1000, -4200); + let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); + let e: i32x2 = i32x2::new(2100, -2700); + let r: i32x2 = transmute(vusdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + + let a: i32x2 = i32x2::new(1000, -4200); + let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); + let e: i32x2 = i32x2::new(260, -5180); + let r: i32x2 = transmute(vusdot_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusdotq_lane_s32() { + let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000); + let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250); + let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); + let e: i32x4 = i32x4::new(2100, -2700, 900, 4300); + let r: i32x4 = transmute(vusdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + + let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000); + let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250); + let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); + let e: i32x4 = i32x4::new(260, -5180, -2220, 540); + let r: i32x4 = transmute(vusdotq_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vsudot_lane_s32() { + let a: i32x2 = i32x2::new(-2000, 4200); + let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); + let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let e: i32x2 = i32x2::new(-900, 3460); + let r: i32x2 = transmute(vsudot_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + + let a: i32x2 = i32x2::new(-2000, 4200); + let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, 
-2, -3); + let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let e: i32x2 = i32x2::new(-500, 3220); + let r: i32x2 = transmute(vsudot_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vsudotq_lane_s32() { + let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000); + let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11); + let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let e: i32x4 = i32x4::new(-900, 3460, -3580, -2420); + let r: i32x4 = transmute(vsudotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + + let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000); + let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11); + let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); + let e: i32x4 = i32x4::new(-500, 3220, -4460, -3940); + let r: i32x4 = transmute(vsudotq_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmul_s8() { let a: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2); @@ -39051,6 +39375,86 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdot_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: i32x2 = i32x2::new(31, 176); + let r: i32x2 = transmute(vdot_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdotq_s32() { + let a: i32x4 = i32x4::new(1, 2, 1, 2); + let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let e: i32x4 = i32x4::new(31, 176, 31, 176); + let r: i32x4 = transmute(vdotq_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdot_u32() { + let a: u32x2 = u32x2::new(1, 2); + let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u32x2 = u32x2::new(31, 176); + let r: u32x2 = transmute(vdot_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdotq_u32() { + let a: u32x4 = u32x4::new(1, 2, 1, 2); + let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let e: u32x4 = u32x4::new(31, 176, 31, 176); + let r: u32x4 = transmute(vdotq_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdot_lane_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i8x8 = i8x8::new(-1, 2, 3, 4, 5, 6, 7, 8); + let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: i32x2 = i32x2::new(29, 72); + let r: i32x2 = transmute(vdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdotq_lane_s32() { + let a: i32x4 = i32x4::new(1, 2, 1, 2); + let b: i8x16 = i8x16::new(-1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: i32x4 = i32x4::new(29, 72, 31, 72); + let r: i32x4 = transmute(vdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } 
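(Illustrative aside, not part of the patch: the arithmetic exercised by the new dotprod tests above can be written as a plain scalar sketch. Each 32-bit accumulator lane gains the dot product of the matching group of four bytes from the two byte vectors, and the `_lane_` variants first broadcast one four-byte group of `c` to every lane, which is what the `simd_shuffle!` calls in the intrinsics do. The helper name below is ours.)

// Scalar model of vdot_s32: out[lane] = a[lane] + sum of b[4*lane + i] * c[4*lane + i] for i in 0..4.
fn vdot_s32_scalar(a: [i32; 2], b: [i8; 8], c: [i8; 8]) -> [i32; 2] {
    let mut out = a;
    for lane in 0..2 {
        for i in 0..4 {
            // Widen each byte to i32 before multiplying, then accumulate into the matching lane.
            out[lane] += i32::from(b[4 * lane + i]) * i32::from(c[4 * lane + i]);
        }
    }
    out
}

// With the inputs of test_vdot_s32 above (a = [1, 2], b = c = [1..=8]) this yields [31, 176],
// matching the expected value e.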
+ + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdot_lane_u32() { + let a: u32x2 = u32x2::new(1, 2); + let b: u8x8 = u8x8::new(255, 2, 3, 4, 5, 6, 7, 8); + let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u32x2 = u32x2::new(285, 72); + let r: u32x2 = transmute(vdot_lane_u32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdotq_lane_u32() { + let a: u32x4 = u32x4::new(1, 2, 1, 2); + let b: u8x16 = u8x16::new(255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u32x4 = u32x4::new(285, 72, 31, 72); + let r: u32x4 = transmute(vdotq_lane_u32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmax_s8() { let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); diff --git a/library/stdarch/crates/core_arch/src/arm_shared/registers/aarch32.rs b/library/stdarch/crates/core_arch/src/arm_shared/registers/aarch32.rs deleted file mode 100644 index e0b71218a..000000000 --- a/library/stdarch/crates/core_arch/src/arm_shared/registers/aarch32.rs +++ /dev/null @@ -1,9 +0,0 @@ -/// Application Program Status Register -pub struct APSR; - -// Note (@Lokathor): Because this breaks the use of Rust on the Game Boy -// Advance, this change must be reverted until Rust learns to handle cpu state -// properly. See also: https://github.com/rust-lang/stdarch/issues/702 - -//#[cfg(any(not(target_feature = "thumb-state"), target_feature = "v6t2"))] -//rsr!(APSR); diff --git a/library/stdarch/crates/core_arch/src/arm_shared/registers/mod.rs b/library/stdarch/crates/core_arch/src/arm_shared/registers/mod.rs deleted file mode 100644 index 621efe2f5..000000000 --- a/library/stdarch/crates/core_arch/src/arm_shared/registers/mod.rs +++ /dev/null @@ -1,121 +0,0 @@ -#[allow(unused_macros)] -macro_rules! rsr { - ($R:ident) => { - impl super::super::sealed::Rsr for $R { - unsafe fn __rsr(&self) -> u32 { - let r: u32; - crate::arch::asm!(concat!("mrs {},", stringify!($R)), out(reg) r, options(nomem, nostack)); - r - } - } - }; -} - -#[allow(unused_macros)] -macro_rules! rsrp { - ($R:ident) => { - impl super::super::sealed::Rsrp for $R { - unsafe fn __rsrp(&self) -> *const u8 { - let r: *const u8; - crate::arch::asm!(concat!("mrs {},", stringify!($R)), out(reg) r, options(nomem, nostack)); - r - } - } - }; -} - -#[allow(unused_macros)] -macro_rules! wsr { - ($R:ident) => { - impl super::super::sealed::Wsr for $R { - unsafe fn __wsr(&self, value: u32) { - crate::arch::asm!(concat!("msr ", stringify!($R), ", {}"), in(reg) value, options(nomem, nostack)); - } - } - }; -} - -#[allow(unused_macros)] -macro_rules! 
wsrp { - ($R:ident) => { - impl super::super::sealed::Wsrp for $R { - unsafe fn __wsrp(&self, value: *const u8) { - crate::arch::asm!(concat!("msr ", stringify!($R), ", {}"), in(reg) value, options(nomem, nostack)); - } - } - }; -} - -#[cfg(target_feature = "mclass")] -mod v6m; - -#[cfg(target_feature = "mclass")] -pub use self::v6m::*; - -#[cfg(all(target_feature = "v7", target_feature = "mclass"))] -mod v7m; - -#[cfg(all(target_feature = "v7", target_feature = "mclass"))] -pub use self::v7m::*; - -#[cfg(not(target_arch = "aarch64"))] -mod aarch32; - -#[cfg(not(target_arch = "aarch64"))] -pub use self::aarch32::*; - -/// Reads a 32-bit system register -#[inline(always)] -pub unsafe fn __rsr<R>(reg: R) -> u32 -where - R: super::sealed::Rsr, -{ - reg.__rsr() -} - -/// Reads a 64-bit system register -#[cfg(target_arch = "aarch64")] -#[inline(always)] -pub unsafe fn __rsr64<R>(reg: R) -> u64 -where - R: super::sealed::Rsr64, -{ - reg.__rsr64() -} - -/// Reads a system register containing an address -#[inline(always)] -pub unsafe fn __rsrp<R>(reg: R) -> *const u8 -where - R: super::sealed::Rsrp, -{ - reg.__rsrp() -} - -/// Writes a 32-bit system register -#[inline(always)] -pub unsafe fn __wsr<R>(reg: R, value: u32) -where - R: super::sealed::Wsr, -{ - reg.__wsr(value) -} - -/// Writes a 64-bit system register -#[cfg(target_arch = "aarch64")] -#[inline(always)] -pub unsafe fn __wsr64<R>(reg: R, value: u64) -where - R: super::sealed::Wsr64, -{ - reg.__wsr64(value) -} - -/// Writes a system register containing an address -#[inline(always)] -pub unsafe fn __wsrp<R>(reg: R, value: *const u8) -where - R: super::sealed::Wsrp, -{ - reg.__wsrp(value) -} diff --git a/library/stdarch/crates/core_arch/src/arm_shared/registers/v6m.rs b/library/stdarch/crates/core_arch/src/arm_shared/registers/v6m.rs deleted file mode 100644 index 7acc63b6d..000000000 --- a/library/stdarch/crates/core_arch/src/arm_shared/registers/v6m.rs +++ /dev/null @@ -1,39 +0,0 @@ -/// CONTROL register -pub struct CONTROL; - -rsr!(CONTROL); -wsr!(CONTROL); - -/// Execution Program Status Register -pub struct EPSR; - -rsr!(EPSR); - -/// Interrupt Program Status Register -pub struct IPSR; - -rsr!(IPSR); - -/// Main Stack Pointer -pub struct MSP; - -rsrp!(MSP); -wsrp!(MSP); - -/// Priority Mask Register -pub struct PRIMASK; - -rsr!(PRIMASK); -wsr!(PRIMASK); - -/// Process Stack Pointer -pub struct PSP; - -rsrp!(PSP); -wsrp!(PSP); - -/// Program Status Register -#[allow(non_camel_case_types)] -pub struct xPSR; - -rsr!(xPSR); diff --git a/library/stdarch/crates/core_arch/src/arm_shared/registers/v7m.rs b/library/stdarch/crates/core_arch/src/arm_shared/registers/v7m.rs deleted file mode 100644 index d1b1d474f..000000000 --- a/library/stdarch/crates/core_arch/src/arm_shared/registers/v7m.rs +++ /dev/null @@ -1,17 +0,0 @@ -/// Base Priority Mask Register -pub struct BASEPRI; - -rsr!(BASEPRI); -wsr!(BASEPRI); - -/// Base Priority Mask Register (conditional write) -#[allow(non_camel_case_types)] -pub struct BASEPRI_MAX; - -wsr!(BASEPRI_MAX); - -/// Fault Mask Register -pub struct FAULTMASK; - -rsr!(FAULTMASK); -wsr!(FAULTMASK); diff --git a/library/stdarch/crates/core_arch/src/lib.rs b/library/stdarch/crates/core_arch/src/lib.rs index 023947b83..27dad8e24 100644 --- a/library/stdarch/crates/core_arch/src/lib.rs +++ b/library/stdarch/crates/core_arch/src/lib.rs @@ -2,6 +2,7 @@ #![allow(improper_ctypes_definitions)] #![allow(dead_code)] #![allow(unused_features)] +#![allow(internal_features)] #![deny(rust_2018_idioms)] #![feature( 
custom_inner_attributes, @@ -12,6 +13,7 @@ proc_macro_hygiene, stmt_expr_attributes, core_intrinsics, + intrinsics, no_core, rustc_attrs, stdsimd, diff --git a/library/stdarch/crates/core_arch/src/mod.rs b/library/stdarch/crates/core_arch/src/mod.rs index 12a5b086c..ad3ec863d 100644 --- a/library/stdarch/crates/core_arch/src/mod.rs +++ b/library/stdarch/crates/core_arch/src/mod.rs @@ -64,8 +64,9 @@ pub mod arch { /// See the [module documentation](../index.html) for more details. #[cfg(any(target_arch = "riscv32", doc))] #[doc(cfg(any(target_arch = "riscv32")))] - #[unstable(feature = "stdsimd", issue = "27731")] + #[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] pub mod riscv32 { + pub use crate::core_arch::riscv32::*; pub use crate::core_arch::riscv_shared::*; } @@ -74,7 +75,7 @@ pub mod arch { /// See the [module documentation](../index.html) for more details. #[cfg(any(target_arch = "riscv64", doc))] #[doc(cfg(any(target_arch = "riscv64")))] - #[unstable(feature = "stdsimd", issue = "27731")] + #[unstable(feature = "riscv_ext_intrinsics", issue = "114544")] pub mod riscv64 { pub use crate::core_arch::riscv64::*; // RISC-V RV64 supports all RV32 instructions as well in current specifications (2022-01-05). @@ -279,6 +280,10 @@ mod aarch64; #[doc(cfg(any(target_arch = "arm")))] mod arm; +#[cfg(any(target_arch = "riscv32", doc))] +#[doc(cfg(any(target_arch = "riscv32")))] +mod riscv32; + #[cfg(any(target_arch = "riscv64", doc))] #[doc(cfg(any(target_arch = "riscv64")))] mod riscv64; diff --git a/library/stdarch/crates/core_arch/src/powerpc/altivec.rs b/library/stdarch/crates/core_arch/src/powerpc/altivec.rs index ae10377ce..e94afa77d 100644 --- a/library/stdarch/crates/core_arch/src/powerpc/altivec.rs +++ b/library/stdarch/crates/core_arch/src/powerpc/altivec.rs @@ -15,6 +15,7 @@ use crate::{ core_arch::{simd::*, simd_llvm::*}, + mem, mem::transmute, }; @@ -318,6 +319,12 @@ extern "C" { fn vupkhsh(a: vector_signed_short) -> vector_signed_int; #[link_name = "llvm.ppc.altivec.vupklsh"] fn vupklsh(a: vector_signed_short) -> vector_signed_int; + + #[link_name = "llvm.ppc.altivec.mfvscr"] + fn mfvscr() -> vector_unsigned_short; + + #[link_name = "llvm.ppc.altivec.vlogefp"] + fn vlogefp(a: vector_float) -> vector_float; } macro_rules! s_t_l { @@ -528,6 +535,60 @@ mod sealed { impl_vec_lde! { vec_lde_f32 lvewx f32 } + pub trait VectorXl { + type Result; + unsafe fn vec_xl(self, a: isize) -> Self::Result; + } + + macro_rules! impl_vec_xl { + ($fun:ident $notpwr9:ident / $pwr9:ident $ty:ident) => { + #[inline] + #[target_feature(enable = "altivec")] + #[cfg_attr( + all(test, not(target_feature = "power9-altivec")), + assert_instr($notpwr9) + )] + #[cfg_attr(all(test, target_feature = "power9-altivec"), assert_instr($pwr9))] + pub unsafe fn $fun(a: isize, b: *const $ty) -> t_t_l!($ty) { + let addr = (b as *const u8).offset(a); + + // Workaround ptr::copy_nonoverlapping not being inlined + extern "rust-intrinsic" { + #[rustc_const_stable(feature = "const_intrinsic_copy", since = "1.63.0")] + #[rustc_nounwind] + pub fn copy_nonoverlapping<T>(src: *const T, dst: *mut T, count: usize); + } + + let mut r = mem::MaybeUninit::uninit(); + + copy_nonoverlapping( + addr, + r.as_mut_ptr() as *mut u8, + mem::size_of::<t_t_l!($ty)>(), + ); + + r.assume_init() + } + + impl VectorXl for *const $ty { + type Result = t_t_l!($ty); + #[inline] + #[target_feature(enable = "altivec")] + unsafe fn vec_xl(self, a: isize) -> Self::Result { + $fun(a, self) + } + } + }; + } + + impl_vec_xl! 
{ vec_xl_i8 lxvd2x / lxv i8 } + impl_vec_xl! { vec_xl_u8 lxvd2x / lxv u8 } + impl_vec_xl! { vec_xl_i16 lxvd2x / lxv i16 } + impl_vec_xl! { vec_xl_u16 lxvd2x / lxv u16 } + impl_vec_xl! { vec_xl_i32 lxvd2x / lxv i32 } + impl_vec_xl! { vec_xl_u32 lxvd2x / lxv u32 } + impl_vec_xl! { vec_xl_f32 lxvd2x / lxv f32 } + test_impl! { vec_floor(a: vector_float) -> vector_float [ vfloor, vrfim / xvrspim ] } test_impl! { vec_vexptefp(a: vector_float) -> vector_float [ vexptefp, vexptefp ] } @@ -2501,6 +2562,24 @@ where p.vec_lde(off) } +/// VSX Unaligned Load +#[inline] +#[target_feature(enable = "altivec")] +pub unsafe fn vec_xl<T>(off: isize, p: T) -> <T as sealed::VectorXl>::Result +where + T: sealed::VectorXl, +{ + p.vec_xl(off) +} + +/// Vector Base-2 Logarithm Estimate +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr(vlogefp))] +pub unsafe fn vec_loge(a: vector_float) -> vector_float { + vlogefp(a) +} + /// Vector floor. #[inline] #[target_feature(enable = "altivec")] @@ -2566,7 +2645,7 @@ pub unsafe fn vec_cmpb(a: vector_float, b: vector_float) -> vector_signed_int { sealed::vec_vcmpbfp(a, b) } -/// Vector cmpb. +/// Vector ceil. #[inline] #[target_feature(enable = "altivec")] pub unsafe fn vec_ceil(a: vector_float) -> vector_float { @@ -2737,6 +2816,14 @@ where a.vec_max(b) } +/// Move From Vector Status and Control Register. +#[inline] +#[target_feature(enable = "altivec")] +#[cfg_attr(test, assert_instr(mfvscr))] +pub unsafe fn vec_mfvscr() -> vector_unsigned_short { + mfvscr() +} + /// Vector add. #[inline] #[target_feature(enable = "altivec")] @@ -3281,6 +3368,24 @@ mod tests { } #[simd_test(enable = "altivec")] + unsafe fn test_vec_xl() { + let pat = [ + u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + u8x16::new( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + ]; + + for off in 0..16 { + let val: u8x16 = transmute(vec_xl(0, (pat.as_ptr() as *const u8).offset(off))); + for i in 0..16 { + let v = val.extract(i); + assert_eq!(off as usize + i, v as usize); + } + } + } + + #[simd_test(enable = "altivec")] unsafe fn test_vec_ldl() { let pat = [ u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), diff --git a/library/stdarch/crates/core_arch/src/riscv32/mod.rs b/library/stdarch/crates/core_arch/src/riscv32/mod.rs new file mode 100644 index 000000000..0a8634c85 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/riscv32/mod.rs @@ -0,0 +1,5 @@ +//! 
RISC-V RV32 specific intrinsics + +mod zk; + +pub use zk::*; diff --git a/library/stdarch/crates/core_arch/src/riscv32/zk.rs b/library/stdarch/crates/core_arch/src/riscv32/zk.rs new file mode 100644 index 000000000..376757772 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/riscv32/zk.rs @@ -0,0 +1,367 @@ +#[cfg(test)] +use stdarch_test::assert_instr; + +extern "unadjusted" { + #[link_name = "llvm.riscv.aes32esi"] + fn _aes32esi(rs1: i32, rs2: i32, bs: i32) -> i32; + + #[link_name = "llvm.riscv.aes32esmi"] + fn _aes32esmi(rs1: i32, rs2: i32, bs: i32) -> i32; + + #[link_name = "llvm.riscv.aes32dsi"] + fn _aes32dsi(rs1: i32, rs2: i32, bs: i32) -> i32; + + #[link_name = "llvm.riscv.aes32dsmi"] + fn _aes32dsmi(rs1: i32, rs2: i32, bs: i32) -> i32; + + #[link_name = "llvm.riscv.zip.i32"] + fn _zip(rs1: i32) -> i32; + + #[link_name = "llvm.riscv.unzip.i32"] + fn _unzip(rs1: i32) -> i32; + + #[link_name = "llvm.riscv.sha512sig0h"] + fn _sha512sig0h(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.sha512sig0l"] + fn _sha512sig0l(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.sha512sig1h"] + fn _sha512sig1h(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.sha512sig1l"] + fn _sha512sig1l(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.sha512sum0r"] + fn _sha512sum0r(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.sha512sum1r"] + fn _sha512sum1r(rs1: i32, rs2: i32) -> i32; +} + +/// AES final round encryption instruction for RV32. +/// +/// This instruction sources a single byte from rs2 according to bs. To this it applies the +/// forward AES SBox operation, before XOR’ing the result with rs1. This instruction must +/// always be implemented such that its execution latency does not depend on the data being +/// operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.3 +/// +/// # Note +/// +/// The `BS` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are +/// used. +/// +/// # Safety +/// +/// This function is safe to use if the `zkne` target feature is present. +#[target_feature(enable = "zkne")] +#[rustc_legacy_const_generics(2)] +// See #1464 +// #[cfg_attr(test, assert_instr(aes32esi, BS = 0))] +#[inline] +pub unsafe fn aes32esi<const BS: u8>(rs1: u32, rs2: u32) -> u32 { + static_assert!(BS < 4); + + _aes32esi(rs1 as i32, rs2 as i32, BS as i32) as u32 +} + +/// AES middle round encryption instruction for RV32 with. +/// +/// This instruction sources a single byte from rs2 according to bs. To this it applies the +/// forward AES SBox operation, and a partial forward MixColumn, before XOR’ing the result with +/// rs1. This instruction must always be implemented such that its execution latency does not +/// depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.4 +/// +/// # Note +/// +/// The `bs` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are +/// used. +/// +/// # Safety +/// +/// This function is safe to use if the `zkne` target feature is present. 
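The four `BS` positions of `aes32esi` together cover a whole 32-bit column, so the final-round SubBytes-plus-AddRoundKey step composes as a chain of four calls. A minimal sketch, not part of the patch: the helper name is made up, and `col` is assumed to already hold the ShiftRows-selected bytes for that column.

```rust
// Sketch only: fold the S-box of each byte of `col` into the round-key word.
// Needs the RV32 Zkne extension (and the unstable riscv intrinsics feature).
#[cfg(target_arch = "riscv32")]
#[target_feature(enable = "zkne")]
unsafe fn aes_final_round_column(col: u32, rk: u32) -> u32 {
    use core::arch::riscv32::aes32esi;
    let t = aes32esi::<0>(rk, col); // byte 0 of `col`, S-boxed, XORed into `rk`
    let t = aes32esi::<1>(t, col);  // byte 1
    let t = aes32esi::<2>(t, col);  // byte 2
    aes32esi::<3>(t, col)           // byte 3 completes SubBytes(col) ^ rk
}
```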
+#[target_feature(enable = "zkne")] +#[rustc_legacy_const_generics(2)] +// See #1464 +// #[cfg_attr(test, assert_instr(aes32esmi, BS = 0))] +#[inline] +pub unsafe fn aes32esmi<const BS: u8>(rs1: u32, rs2: u32) -> u32 { + static_assert!(BS < 4); + + _aes32esmi(rs1 as i32, rs2 as i32, BS as i32) as u32 +} + +/// AES final round decryption instruction for RV32. +/// +/// This instruction sources a single byte from rs2 according to bs. To this it applies the +/// inverse AES SBox operation, and XOR’s the result with rs1. This instruction must always be +/// implemented such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.1 +/// +/// # Note +/// +/// The `BS` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are +/// used. +/// +/// # Safety +/// +/// This function is safe to use if the `zknd` target feature is present. +#[target_feature(enable = "zknd")] +#[rustc_legacy_const_generics(2)] +// See #1464 +// #[cfg_attr(test, assert_instr(aes32dsi, BS = 0))] +#[inline] +pub unsafe fn aes32dsi<const BS: u8>(rs1: u32, rs2: u32) -> u32 { + static_assert!(BS < 4); + + _aes32dsi(rs1 as i32, rs2 as i32, BS as i32) as u32 +} + +/// AES middle round decryption instruction for RV32. +/// +/// This instruction sources a single byte from rs2 according to bs. To this it applies the +/// inverse AES SBox operation, and a partial inverse MixColumn, before XOR’ing the result with +/// rs1. This instruction must always be implemented such that its execution latency does not +/// depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.2 +/// +/// # Note +/// +/// The `BS` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are +/// used. +/// +/// # Safety +/// +/// This function is safe to use if the `zknd` target feature is present. +#[target_feature(enable = "zknd")] +#[rustc_legacy_const_generics(2)] +// See #1464 +// #[cfg_attr(test, assert_instr(aes32dsmi, BS = 0))] +#[inline] +pub unsafe fn aes32dsmi<const BS: u8>(rs1: u32, rs2: u32) -> u32 { + static_assert!(BS < 4); + + _aes32dsmi(rs1 as i32, rs2 as i32, BS as i32) as u32 +} + +/// Place upper/lower halves of the source register into odd/even bits of the destination +/// respectivley. +/// +/// This instruction places bits in the low half of the source register into the even bit +/// positions of the destination, and bits in the high half of the source register into the odd +/// bit positions of the destination. It is the inverse of the unzip instruction. This +/// instruction is available only on RV32. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.49 +/// +/// # Safety +/// +/// This function is safe to use if the `zbkb` target feature is present. +#[target_feature(enable = "zbkb")] +// See #1464 +// #[cfg_attr(test, assert_instr(zip))] +#[inline] +pub unsafe fn zip(rs: u32) -> u32 { + _zip(rs as i32) as u32 +} + +/// Place odd and even bits of the source word into upper/lower halves of the destination. +/// +/// This instruction places the even bits of the source register into the low half of the +/// destination, and the odd bits of the source into the high bits of the destination. 
It is +/// the inverse of the zip instruction. This instruction is available only on RV32. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.45 +/// +/// # Safety +/// +/// This function is safe to use if the `zbkb` target feature is present. +#[target_feature(enable = "zbkb")] +#[cfg_attr(test, assert_instr(unzip))] +#[inline] +pub unsafe fn unzip(rs: u32) -> u32 { + _unzip(rs as i32) as u32 +} + +/// Implements the high half of the Sigma0 transformation, as used in the SHA2-512 hash +/// function \[49\] (Section 4.1.3). +/// +/// This instruction is implemented on RV32 only. Used to compute the Sigma0 transform of the +/// SHA2-512 hash function in conjunction with the sha512sig0l instruction. The transform is a +/// 64-bit to 64-bit function, so the input and output are each represented by two 32-bit +/// registers. This instruction must always be implemented such that its execution latency does +/// not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.31 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sig0h))] +#[inline] +pub unsafe fn sha512sig0h(rs1: u32, rs2: u32) -> u32 { + _sha512sig0h(rs1 as i32, rs2 as i32) as u32 +} + +/// Implements the low half of the Sigma0 transformation, as used in the SHA2-512 hash function +/// \[49\] (Section 4.1.3). +/// +/// This instruction is implemented on RV32 only. Used to compute the Sigma0 transform of the +/// SHA2-512 hash function in conjunction with the sha512sig0h instruction. The transform is a +/// 64-bit to 64-bit function, so the input and output are each represented by two 32-bit +/// registers. This instruction must always be implemented such that its execution latency does +/// not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.32 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sig0l))] +#[inline] +pub unsafe fn sha512sig0l(rs1: u32, rs2: u32) -> u32 { + _sha512sig0l(rs1 as i32, rs2 as i32) as u32 +} + +/// Implements the high half of the Sigma1 transformation, as used in the SHA2-512 hash +/// function \[49\] (Section 4.1.3). +/// +/// This instruction is implemented on RV32 only. Used to compute the Sigma1 transform of the +/// SHA2-512 hash function in conjunction with the sha512sig1l instruction. The transform is a +/// 64-bit to 64-bit function, so the input and output are each represented by two 32-bit +/// registers. This instruction must always be implemented such that its execution latency does +/// not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.33 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. 
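Since `zip` and `unzip` are exact inverses of each other, a round trip through them is a handy sanity check for the bit-interleaved layout that Zbkb-based code uses. A minimal sketch, not part of the patch; the helper name is illustrative.

```rust
#[cfg(target_arch = "riscv32")]
#[target_feature(enable = "zbkb")]
unsafe fn demo_zip_roundtrip(x: u32) {
    use core::arch::riscv32::{unzip, zip};
    // `unzip` gathers the even bits of `x` into the low half and the odd bits
    // into the high half; `zip` interleaves them back into the original word.
    let u = unzip(x);
    assert_eq!(zip(u), x);
}
```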
+#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sig1h))] +#[inline] +pub unsafe fn sha512sig1h(rs1: u32, rs2: u32) -> u32 { + _sha512sig1h(rs1 as i32, rs2 as i32) as u32 +} + +/// Implements the low half of the Sigma1 transformation, as used in the SHA2-512 hash function +/// \[49\] (Section 4.1.3). +/// +/// This instruction is implemented on RV32 only. Used to compute the Sigma1 transform of the +/// SHA2-512 hash function in conjunction with the sha512sig1h instruction. The transform is a +/// 64-bit to 64-bit function, so the input and output are each represented by two 32-bit +/// registers. This instruction must always be implemented such that its execution latency does +/// not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.34 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. +#[target_feature(enable = "zknh")] +#[cfg_attr(test, assert_instr(sha512sig1l))] +#[inline] +pub unsafe fn sha512sig1l(rs1: u32, rs2: u32) -> u32 { + _sha512sig1l(rs1 as i32, rs2 as i32) as u32 +} + +/// Implements the Sum0 transformation, as used in the SHA2-512 hash function \[49\] (Section +/// 4.1.3). +/// +/// This instruction is implemented on RV32 only. Used to compute the Sum0 transform of the +/// SHA2-512 hash function. The transform is a 64-bit to 64-bit function, so the input and +/// output is represented by two 32-bit registers. This instruction must always be implemented +/// such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.35 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sum0r))] +#[inline] +pub unsafe fn sha512sum0r(rs1: u32, rs2: u32) -> u32 { + _sha512sum0r(rs1 as i32, rs2 as i32) as u32 +} + +/// Implements the Sum1 transformation, as used in the SHA2-512 hash function \[49\] (Section +/// 4.1.3). +/// +/// This instruction is implemented on RV32 only. Used to compute the Sum1 transform of the +/// SHA2-512 hash function. The transform is a 64-bit to 64-bit function, so the input and +/// output is represented by two 32-bit registers. This instruction must always be implemented +/// such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.36 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sum1r))] +#[inline] +pub unsafe fn sha512sum1r(rs1: u32, rs2: u32) -> u32 { + _sha512sum1r(rs1 as i32, rs2 as i32) as u32 +} diff --git a/library/stdarch/crates/core_arch/src/riscv64/mod.rs b/library/stdarch/crates/core_arch/src/riscv64/mod.rs index 751b9a860..ad16d6c23 100644 --- a/library/stdarch/crates/core_arch/src/riscv64/mod.rs +++ b/library/stdarch/crates/core_arch/src/riscv64/mod.rs @@ -1,6 +1,10 @@ //! 
RISC-V RV64 specific intrinsics use crate::arch::asm; +mod zk; + +pub use zk::*; + /// Loads virtual machine memory by unsigned word integer /// /// This instruction performs an explicit memory access as though `V=1`; diff --git a/library/stdarch/crates/core_arch/src/riscv64/zk.rs b/library/stdarch/crates/core_arch/src/riscv64/zk.rs new file mode 100644 index 000000000..3dbe3705d --- /dev/null +++ b/library/stdarch/crates/core_arch/src/riscv64/zk.rs @@ -0,0 +1,281 @@ +#[cfg(test)] +use stdarch_test::assert_instr; + +extern "unadjusted" { + #[link_name = "llvm.riscv.aes64es"] + fn _aes64es(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.aes64esm"] + fn _aes64esm(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.aes64ds"] + fn _aes64ds(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.aes64dsm"] + fn _aes64dsm(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.aes64ks1i"] + fn _aes64ks1i(rs1: i64, rnum: i32) -> i64; + + #[link_name = "llvm.riscv.aes64ks2"] + fn _aes64ks2(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.sha512sig0"] + fn _sha512sig0(rs1: i64) -> i64; + + #[link_name = "llvm.riscv.sha512sig1"] + fn _sha512sig1(rs1: i64) -> i64; + + #[link_name = "llvm.riscv.sha512sum0"] + fn _sha512sum0(rs1: i64) -> i64; + + #[link_name = "llvm.riscv.sha512sum1"] + fn _sha512sum1(rs1: i64) -> i64; +} + +/// AES final round encryption instruction for RV64. +/// +/// Uses the two 64-bit source registers to represent the entire AES state, and produces half +/// of the next round output, applying the ShiftRows and SubBytes steps. This instruction must +/// always be implemented such that its execution latency does not depend on the data being +/// operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.7 +/// +/// # Safety +/// +/// This function is safe to use if the `zkne` target feature is present. +#[target_feature(enable = "zkne")] +// See #1464 +// #[cfg_attr(test, assert_instr(aes64es))] +#[inline] +pub unsafe fn aes64es(rs1: u64, rs2: u64) -> u64 { + _aes64es(rs1 as i64, rs2 as i64) as u64 +} + +/// AES middle round encryption instruction for RV64. +/// +/// Uses the two 64-bit source registers to represent the entire AES state, and produces half +/// of the next round output, applying the ShiftRows, SubBytes and MixColumns steps. This +/// instruction must always be implemented such that its execution latency does not depend on +/// the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.8 +/// +/// # Safety +/// +/// This function is safe to use if the `zkne` target feature is present. +#[target_feature(enable = "zkne")] +// See #1464 +// #[cfg_attr(test, assert_instr(aes64esm))] +#[inline] +pub unsafe fn aes64esm(rs1: u64, rs2: u64) -> u64 { + _aes64esm(rs1 as i64, rs2 as i64) as u64 +} + +/// AES final round decryption instruction for RV64. +/// +/// Uses the two 64-bit source registers to represent the entire AES state, and produces half +/// of the next round output, applying the Inverse ShiftRows and SubBytes steps. This +/// instruction must always be implemented such that its execution latency does not depend on +/// the data being operated on. 
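On RV64 the whole 128-bit AES state fits in two registers, so one middle encryption round is two `aes64esm` calls plus the round-key XOR. A minimal sketch, not part of the patch; `(s0, s1)` and `(rk0, rk1)` are assumed to hold the low/high halves of the state and round key, and the helper name is made up.

```rust
#[cfg(target_arch = "riscv64")]
#[target_feature(enable = "zkne")]
unsafe fn aes_enc_round(s0: u64, s1: u64, rk0: u64, rk1: u64) -> (u64, u64) {
    use core::arch::riscv64::aes64esm;
    // Each call consumes the whole state and produces one half of the
    // ShiftRows+SubBytes+MixColumns output; AddRoundKey is a plain XOR.
    let t0 = aes64esm(s0, s1);
    let t1 = aes64esm(s1, s0);
    (t0 ^ rk0, t1 ^ rk1)
}
```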
+/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.5 +/// +/// # Safety +/// +/// This function is safe to use if the `zknd` target feature is present. +#[target_feature(enable = "zknd")] +// See #1464 +// #[cfg_attr(test, assert_instr(aes64ds))] +#[inline] +pub unsafe fn aes64ds(rs1: u64, rs2: u64) -> u64 { + _aes64ds(rs1 as i64, rs2 as i64) as u64 +} + +/// AES middle round decryption instruction for RV64. +/// +/// Uses the two 64-bit source registers to represent the entire AES state, and produces half +/// of the next round output, applying the Inverse ShiftRows, SubBytes and MixColumns steps. +/// This instruction must always be implemented such that its execution latency does not depend +/// on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.6 +/// +/// # Safety +/// +/// This function is safe to use if the `zknd` target feature is present. +#[target_feature(enable = "zknd")] +// See #1464 +// #[cfg_attr(test, assert_instr(aes64dsm))] +#[inline] +pub unsafe fn aes64dsm(rs1: u64, rs2: u64) -> u64 { + _aes64dsm(rs1 as i64, rs2 as i64) as u64 +} + +/// This instruction implements part of the KeySchedule operation for the AES Block cipher +/// involving the SBox operation. +/// +/// This instruction implements the rotation, SubBytes and Round Constant addition steps of the +/// AES block cipher Key Schedule. This instruction must always be implemented such that its +/// execution latency does not depend on the data being operated on. Note that rnum must be in +/// the range 0x0..0xA. The values 0xB..0xF are reserved. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.10 +/// +/// # Note +/// +/// The `RNUM` parameter is expected to be a constant value inside the range of `0..=10`. +/// +/// # Safety +/// +/// This function is safe to use if the `zkne` or `zknd` target feature is present. +#[target_feature(enable = "zkne", enable = "zknd")] +#[rustc_legacy_const_generics(1)] +// See #1464 +// #[cfg_attr(test, assert_instr(aes64ks1i, RNUM = 0))] +#[inline] +pub unsafe fn aes64ks1i<const RNUM: u8>(rs1: u64) -> u64 { + static_assert!(RNUM <= 10); + + _aes64ks1i(rs1 as i64, RNUM as i32) as u64 +} + +/// This instruction implements part of the KeySchedule operation for the AES Block cipher. +/// +/// This instruction implements the additional XOR’ing of key words as part of the AES block +/// cipher Key Schedule. This instruction must always be implemented such that its execution +/// latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.11 +/// +/// # Safety +/// +/// This function is safe to use if the `zkne` or `zknd` target feature is present. +#[target_feature(enable = "zkne", enable = "zknd")] +// See #1464 +// #[cfg_attr(test, assert_instr(aes64ks2))] +#[inline] +pub unsafe fn aes64ks2(rs1: u64, rs2: u64) -> u64 { + _aes64ks2(rs1 as i64, rs2 as i64) as u64 +} + +/// Implements the Sigma0 transformation function as used in the SHA2-512 hash function \[49\] +/// (Section 4.1.3). +/// +/// This instruction is supported for the RV64 base architecture. It implements the Sigma0 +/// transform of the SHA2-512 hash function. \[49\]. 
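Chained with `aes64ks2`, the `aes64ks1i` step yields one AES-128 key-expansion round per round constant. A minimal sketch, not part of the patch: the operand order follows the instruction descriptions above, `(k0, k1)` are assumed to be the low/high halves of the previous round key, and the helper name is made up.

```rust
#[cfg(target_arch = "riscv64")]
#[target_feature(enable = "zkne", enable = "zknd")]
unsafe fn aes128_expand_step<const RNUM: u8>(k0: u64, k1: u64) -> (u64, u64) {
    use core::arch::riscv64::{aes64ks1i, aes64ks2};
    // Rotate/SubBytes/round-constant on the last key words, then fold the
    // result through both halves with the key-schedule XOR instruction.
    let t = aes64ks1i::<RNUM>(k1);
    let n0 = aes64ks2(t, k0);
    let n1 = aes64ks2(n0, k1);
    (n0, n1) // the next 128-bit round key
}
```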
This instruction must always be +/// implemented such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.37 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sig0))] +#[inline] +pub unsafe fn sha512sig0(rs1: u64) -> u64 { + _sha512sig0(rs1 as i64) as u64 +} + +/// Implements the Sigma1 transformation function as used in the SHA2-512 hash function \[49\] +/// (Section 4.1.3). +/// +/// This instruction is supported for the RV64 base architecture. It implements the Sigma1 +/// transform of the SHA2-512 hash function. \[49\]. This instruction must always be +/// implemented such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.38 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sig1))] +#[inline] +pub unsafe fn sha512sig1(rs1: u64) -> u64 { + _sha512sig1(rs1 as i64) as u64 +} + +/// Implements the Sum0 transformation function as used in the SHA2-512 hash function \[49\] +/// (Section 4.1.3). +/// +/// This instruction is supported for the RV64 base architecture. It implements the Sum0 +/// transform of the SHA2-512 hash function. \[49\]. This instruction must always be +/// implemented such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.39 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sum0))] +#[inline] +pub unsafe fn sha512sum0(rs1: u64) -> u64 { + _sha512sum0(rs1 as i64) as u64 +} + +/// Implements the Sum1 transformation function as used in the SHA2-512 hash function \[49\] +/// (Section 4.1.3). +/// +/// This instruction is supported for the RV64 base architecture. It implements the Sum1 +/// transform of the SHA2-512 hash function. \[49\]. This instruction must always be +/// implemented such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.40 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha512sum1))] +#[inline] +pub unsafe fn sha512sum1(rs1: u64) -> u64 { + _sha512sum1(rs1 as i64) as u64 +} diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs b/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs index ed021df5a..14f6989d2 100644 --- a/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs +++ b/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs @@ -1,7 +1,13 @@ //! 
Shared RISC-V intrinsics + mod p; +mod zb; +mod zk; +#[unstable(feature = "stdsimd", issue = "27731")] pub use p::*; +pub use zb::*; +pub use zk::*; use crate::arch::asm; @@ -10,6 +16,7 @@ use crate::arch::asm; /// The PAUSE instruction is a HINT that indicates the current hart's rate of instruction retirement /// should be temporarily reduced or paused. The duration of its effect must be bounded and may be zero. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub fn pause() { unsafe { asm!(".insn i 0x0F, 0, x0, x0, 0x010", options(nomem, nostack)) } } @@ -19,6 +26,7 @@ pub fn pause() { /// The NOP instruction does not change any architecturally visible state, except for /// advancing the `pc` and incrementing any applicable performance counters. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub fn nop() { unsafe { asm!("nop", options(nomem, nostack)) } } @@ -29,6 +37,7 @@ pub fn nop() { /// until an interrupt might need servicing. This instruction is a hint, /// and a legal implementation is to simply implement WFI as a NOP. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn wfi() { asm!("wfi", options(nomem, nostack)) } @@ -41,6 +50,7 @@ pub unsafe fn wfi() { /// FENCE.I does not ensure that other RISC-V harts' instruction fetches will observe the /// local hart's stores in a multiprocessor system. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn fence_i() { asm!("fence.i", options(nostack)) } @@ -54,6 +64,7 @@ pub unsafe fn fence_i() { /// virtual address in parameter `vaddr` and that match the address space identified by integer /// parameter `asid`, except for entries containing global mappings. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn sfence_vma(vaddr: usize, asid: usize) { asm!("sfence.vma {}, {}", in(reg) vaddr, in(reg) asid, options(nostack)) } @@ -65,6 +76,7 @@ pub unsafe fn sfence_vma(vaddr: usize, asid: usize) { /// The fence also invalidates all address-translation cache entries that contain leaf page /// table entries corresponding to the virtual address in parameter `vaddr`, for all address spaces. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn sfence_vma_vaddr(vaddr: usize) { asm!("sfence.vma {}, x0", in(reg) vaddr, options(nostack)) } @@ -78,6 +90,7 @@ pub unsafe fn sfence_vma_vaddr(vaddr: usize) { /// address-translation cache entries matching the address space identified by integer /// parameter `asid`, except for entries containing global mappings. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn sfence_vma_asid(asid: usize) { asm!("sfence.vma x0, {}", in(reg) asid, options(nostack)) } @@ -88,6 +101,7 @@ pub unsafe fn sfence_vma_asid(asid: usize) { /// tables, for all address spaces. The fence also invalidates all address-translation cache entries, /// for all address spaces. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn sfence_vma_all() { asm!("sfence.vma", options(nostack)) } @@ -97,6 +111,7 @@ pub unsafe fn sfence_vma_all() { /// This instruction invalidates any address-translation cache entries that an /// `SFENCE.VMA` instruction with the same values of `vaddr` and `asid` would invalidate. 
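A typical use of `fence_i` is synchronizing instruction fetch after writing freshly generated code. A minimal sketch, not part of the patch; `dst` and `code` are placeholder names and the destination is assumed to be writable, executable memory.

```rust
#[cfg(target_arch = "riscv64")]
unsafe fn install_code(dst: *mut u8, code: &[u8]) {
    use core::arch::riscv64::fence_i;
    core::ptr::copy_nonoverlapping(code.as_ptr(), dst, code.len());
    // Make the just-written bytes visible to this hart's instruction fetches
    // before jumping to them (other harts need their own synchronization).
    fence_i();
}
```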
#[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn sinval_vma(vaddr: usize, asid: usize) { // asm!("sinval.vma {}, {}", in(reg) vaddr, in(reg) asid, options(nostack)) asm!(".insn r 0x73, 0, 0x0B, x0, {}, {}", in(reg) vaddr, in(reg) asid, options(nostack)) @@ -107,6 +122,7 @@ pub unsafe fn sinval_vma(vaddr: usize, asid: usize) { /// This instruction invalidates any address-translation cache entries that an /// `SFENCE.VMA` instruction with the same values of `vaddr` and `asid` would invalidate. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn sinval_vma_vaddr(vaddr: usize) { asm!(".insn r 0x73, 0, 0x0B, x0, {}, x0", in(reg) vaddr, options(nostack)) } @@ -116,6 +132,7 @@ pub unsafe fn sinval_vma_vaddr(vaddr: usize) { /// This instruction invalidates any address-translation cache entries that an /// `SFENCE.VMA` instruction with the same values of `vaddr` and `asid` would invalidate. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn sinval_vma_asid(asid: usize) { asm!(".insn r 0x73, 0, 0x0B, x0, x0, {}", in(reg) asid, options(nostack)) } @@ -125,6 +142,7 @@ pub unsafe fn sinval_vma_asid(asid: usize) { /// This instruction invalidates any address-translation cache entries that an /// `SFENCE.VMA` instruction with the same values of `vaddr` and `asid` would invalidate. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn sinval_vma_all() { asm!(".insn r 0x73, 0, 0x0B, x0, x0, x0", options(nostack)) } @@ -134,6 +152,7 @@ pub unsafe fn sinval_vma_all() { /// This instruction guarantees that any previous stores already visible to the current RISC-V hart /// are ordered before subsequent `SINVAL.VMA` instructions executed by the same hart. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn sfence_w_inval() { // asm!("sfence.w.inval", options(nostack)) asm!(".insn i 0x73, 0, x0, x0, 0x180", options(nostack)) @@ -144,6 +163,7 @@ pub unsafe fn sfence_w_inval() { /// This instruction guarantees that any previous SINVAL.VMA instructions executed by the current hart /// are ordered before subsequent implicit references by that hart to the memory-management data structures. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn sfence_inval_ir() { // asm!("sfence.inval.ir", options(nostack)) asm!(".insn i 0x73, 0, x0, x0, 0x181", options(nostack)) @@ -158,6 +178,7 @@ pub unsafe fn sfence_inval_ir() { /// This function is unsafe for it accesses the virtual supervisor or user via a `HLV.B` /// instruction which is effectively a dereference to any memory address. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hlv_b(src: *const i8) -> i8 { let value: i8; asm!(".insn i 0x73, 0x4, {}, {}, 0x600", out(reg) value, in(reg) src, options(readonly, nostack)); @@ -173,6 +194,7 @@ pub unsafe fn hlv_b(src: *const i8) -> i8 { /// This function is unsafe for it accesses the virtual supervisor or user via a `HLV.BU` /// instruction which is effectively a dereference to any memory address. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hlv_bu(src: *const u8) -> u8 { let value: u8; asm!(".insn i 0x73, 0x4, {}, {}, 0x601", out(reg) value, in(reg) src, options(readonly, nostack)); @@ -188,6 +210,7 @@ pub unsafe fn hlv_bu(src: *const u8) -> u8 { /// This function is unsafe for it accesses the virtual supervisor or user via a `HLV.H` /// instruction which is effectively a dereference to any memory address. 
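Taken together, the Svinval primitives split a classic `SFENCE.VMA` into a pair of bracketing fences around any number of per-page invalidations, which keeps the rest of the TLB warm. A minimal sketch, not part of the patch; `pages` and `asid` are placeholder inputs.

```rust
#[cfg(target_arch = "riscv64")]
unsafe fn invalidate_pages(pages: &[usize], asid: usize) {
    use core::arch::riscv64::{sfence_inval_ir, sfence_w_inval, sinval_vma};
    sfence_w_inval();            // order prior PTE stores before the invalidations
    for &vaddr in pages {
        sinval_vma(vaddr, asid); // invalidate one page's translations
    }
    sfence_inval_ir();           // order invalidations before later implicit fetches
}
```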
#[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hlv_h(src: *const i16) -> i16 { let value: i16; asm!(".insn i 0x73, 0x4, {}, {}, 0x640", out(reg) value, in(reg) src, options(readonly, nostack)); @@ -203,6 +226,7 @@ pub unsafe fn hlv_h(src: *const i16) -> i16 { /// This function is unsafe for it accesses the virtual supervisor or user via a `HLV.HU` /// instruction which is effectively a dereference to any memory address. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hlv_hu(src: *const u16) -> u16 { let value: u16; asm!(".insn i 0x73, 0x4, {}, {}, 0x641", out(reg) value, in(reg) src, options(readonly, nostack)); @@ -218,6 +242,7 @@ pub unsafe fn hlv_hu(src: *const u16) -> u16 { /// This function is unsafe for it accesses the virtual supervisor or user via a `HLVX.HU` /// instruction which is effectively a dereference to any memory address. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hlvx_hu(src: *const u16) -> u16 { let insn: u16; asm!(".insn i 0x73, 0x4, {}, {}, 0x643", out(reg) insn, in(reg) src, options(readonly, nostack)); @@ -233,6 +258,7 @@ pub unsafe fn hlvx_hu(src: *const u16) -> u16 { /// This function is unsafe for it accesses the virtual supervisor or user via a `HLV.W` /// instruction which is effectively a dereference to any memory address. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hlv_w(src: *const i32) -> i32 { let value: i32; asm!(".insn i 0x73, 0x4, {}, {}, 0x680", out(reg) value, in(reg) src, options(readonly, nostack)); @@ -248,6 +274,7 @@ pub unsafe fn hlv_w(src: *const i32) -> i32 { /// This function is unsafe for it accesses the virtual supervisor or user via a `HLVX.WU` /// instruction which is effectively a dereference to any memory address. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hlvx_wu(src: *const u32) -> u32 { let insn: u32; asm!(".insn i 0x73, 0x4, {}, {}, 0x683", out(reg) insn, in(reg) src, options(readonly, nostack)); @@ -263,6 +290,7 @@ pub unsafe fn hlvx_wu(src: *const u32) -> u32 { /// This function is unsafe for it accesses the virtual supervisor or user via a `HSV.B` /// instruction which is effectively a dereference to any memory address. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hsv_b(dst: *mut i8, src: i8) { asm!(".insn r 0x73, 0x4, 0x31, x0, {}, {}", in(reg) dst, in(reg) src, options(nostack)); } @@ -276,6 +304,7 @@ pub unsafe fn hsv_b(dst: *mut i8, src: i8) { /// This function is unsafe for it accesses the virtual supervisor or user via a `HSV.H` /// instruction which is effectively a dereference to any memory address. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hsv_h(dst: *mut i16, src: i16) { asm!(".insn r 0x73, 0x4, 0x33, x0, {}, {}", in(reg) dst, in(reg) src, options(nostack)); } @@ -289,6 +318,7 @@ pub unsafe fn hsv_h(dst: *mut i16, src: i16) { /// This function is unsafe for it accesses the virtual supervisor or user via a `HSV.W` /// instruction which is effectively a dereference to any memory address. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hsv_w(dst: *mut i32, src: i32) { asm!(".insn r 0x73, 0x4, 0x35, x0, {}, {}", in(reg) dst, in(reg) src, options(nostack)); } @@ -302,6 +332,7 @@ pub unsafe fn hsv_w(dst: *mut i32, src: i32) { /// /// This fence specifies a single guest virtual address, and a single guest address-space identifier. 
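The HLV/HSV wrappers let a hypervisor touch guest memory through the guest's own translation, as if `V=1`. A minimal sketch, not part of the patch; the guest pointers are placeholders and are assumed to be valid for the currently targeted guest.

```rust
#[cfg(target_arch = "riscv64")]
unsafe fn mirror_guest_word(guest_src: *const i32, guest_dst: *mut i32) {
    use core::arch::riscv64::{hlv_w, hsv_w};
    let value = hlv_w(guest_src); // load through the guest's address translation
    hsv_w(guest_dst, value);      // store back through the same translation
}
```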
#[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hfence_vvma(vaddr: usize, asid: usize) { // asm!("hfence.vvma {}, {}", in(reg) vaddr, in(reg) asid) asm!(".insn r 0x73, 0, 0x11, x0, {}, {}", in(reg) vaddr, in(reg) asid, options(nostack)) @@ -316,6 +347,7 @@ pub unsafe fn hfence_vvma(vaddr: usize, asid: usize) { /// /// This fence specifies a single guest virtual address. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hfence_vvma_vaddr(vaddr: usize) { asm!(".insn r 0x73, 0, 0x11, x0, {}, x0", in(reg) vaddr, options(nostack)) } @@ -329,6 +361,7 @@ pub unsafe fn hfence_vvma_vaddr(vaddr: usize) { /// /// This fence specifies a single guest address-space identifier. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hfence_vvma_asid(asid: usize) { asm!(".insn r 0x73, 0, 0x11, x0, x0, {}", in(reg) asid, options(nostack)) } @@ -342,6 +375,7 @@ pub unsafe fn hfence_vvma_asid(asid: usize) { /// /// This fence applies to any guest address spaces and guest virtual addresses. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hfence_vvma_all() { asm!(".insn r 0x73, 0, 0x11, x0, x0, x0", options(nostack)) } @@ -354,6 +388,7 @@ pub unsafe fn hfence_vvma_all() { /// This fence specifies a single guest physical address, **shifted right by 2 bits**, and a single virtual machine /// by virtual machine identifier (VMID). #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hfence_gvma(gaddr: usize, vmid: usize) { // asm!("hfence.gvma {}, {}", in(reg) gaddr, in(reg) vmid, options(nostack)) asm!(".insn r 0x73, 0, 0x31, x0, {}, {}", in(reg) gaddr, in(reg) vmid, options(nostack)) @@ -366,6 +401,7 @@ pub unsafe fn hfence_gvma(gaddr: usize, vmid: usize) { /// /// This fence specifies a single guest physical address; **the physical address should be shifted right by 2 bits**. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hfence_gvma_gaddr(gaddr: usize) { asm!(".insn r 0x73, 0, 0x31, x0, {}, x0", in(reg) gaddr, options(nostack)) } @@ -377,6 +413,7 @@ pub unsafe fn hfence_gvma_gaddr(gaddr: usize) { /// /// This fence specifies a single virtual machine by virtual machine identifier (VMID). #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hfence_gvma_vmid(vmid: usize) { asm!(".insn r 0x73, 0, 0x31, x0, x0, {}", in(reg) vmid, options(nostack)) } @@ -388,6 +425,7 @@ pub unsafe fn hfence_gvma_vmid(vmid: usize) { /// /// This fence specifies all guest physical addresses and all virtual machines. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hfence_gvma_all() { asm!(".insn r 0x73, 0, 0x31, x0, x0, x0", options(nostack)) } @@ -399,6 +437,7 @@ pub unsafe fn hfence_gvma_all() { /// /// This fence specifies a single guest virtual address, and a single guest address-space identifier. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hinval_vvma(vaddr: usize, asid: usize) { // asm!("hinval.vvma {}, {}", in(reg) vaddr, in(reg) asid, options(nostack)) asm!(".insn r 0x73, 0, 0x13, x0, {}, {}", in(reg) vaddr, in(reg) asid, options(nostack)) @@ -411,6 +450,7 @@ pub unsafe fn hinval_vvma(vaddr: usize, asid: usize) { /// /// This fence specifies a single guest virtual address. 
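After a hypervisor edits a G-stage (guest-physical) mapping, it can flush just that page for one VMID. A minimal sketch, not part of the patch; note the documented quirk above that the guest-physical address operand is passed shifted right by 2 bits, and `gpa`/`vmid` are placeholder values.

```rust
#[cfg(target_arch = "riscv64")]
unsafe fn flush_guest_page(gpa: usize, vmid: usize) {
    use core::arch::riscv64::hfence_gvma;
    // The instruction expects the guest-physical address pre-shifted by 2.
    hfence_gvma(gpa >> 2, vmid);
}
```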
#[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hinval_vvma_vaddr(vaddr: usize) { asm!(".insn r 0x73, 0, 0x13, x0, {}, x0", in(reg) vaddr, options(nostack)) } @@ -422,6 +462,7 @@ pub unsafe fn hinval_vvma_vaddr(vaddr: usize) { /// /// This fence specifies a single guest address-space identifier. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hinval_vvma_asid(asid: usize) { asm!(".insn r 0x73, 0, 0x13, x0, x0, {}", in(reg) asid, options(nostack)) } @@ -433,6 +474,7 @@ pub unsafe fn hinval_vvma_asid(asid: usize) { /// /// This fence applies to any guest address spaces and guest virtual addresses. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hinval_vvma_all() { asm!(".insn r 0x73, 0, 0x13, x0, x0, x0", options(nostack)) } @@ -445,6 +487,7 @@ pub unsafe fn hinval_vvma_all() { /// This fence specifies a single guest physical address, **shifted right by 2 bits**, and a single virtual machine /// by virtual machine identifier (VMID). #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hinval_gvma(gaddr: usize, vmid: usize) { // asm!("hinval.gvma {}, {}", in(reg) gaddr, in(reg) vmid, options(nostack)) asm!(".insn r 0x73, 0, 0x33, x0, {}, {}", in(reg) gaddr, in(reg) vmid, options(nostack)) @@ -457,6 +500,7 @@ pub unsafe fn hinval_gvma(gaddr: usize, vmid: usize) { /// /// This fence specifies a single guest physical address; **the physical address should be shifted right by 2 bits**. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hinval_gvma_gaddr(gaddr: usize) { asm!(".insn r 0x73, 0, 0x33, x0, {}, x0", in(reg) gaddr, options(nostack)) } @@ -468,6 +512,7 @@ pub unsafe fn hinval_gvma_gaddr(gaddr: usize) { /// /// This fence specifies a single virtual machine by virtual machine identifier (VMID). #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hinval_gvma_vmid(vmid: usize) { asm!(".insn r 0x73, 0, 0x33, x0, x0, {}", in(reg) vmid, options(nostack)) } @@ -479,6 +524,7 @@ pub unsafe fn hinval_gvma_vmid(vmid: usize) { /// /// This fence specifies all guest physical addresses and all virtual machines. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub unsafe fn hinval_gvma_all() { asm!(".insn r 0x73, 0, 0x33, x0, x0, x0", options(nostack)) } @@ -502,6 +548,7 @@ pub unsafe fn hinval_gvma_all() { /// [`frrm`]: fn.frrm.html /// [`frflags`]: fn.frflags.html #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub fn frcsr() -> u32 { let value: u32; unsafe { asm!("frcsr {}", out(reg) value, options(nomem, nostack)) }; @@ -513,6 +560,7 @@ pub fn frcsr() -> u32 { /// This function swaps the value in `fcsr` by copying the original value to be returned, /// and then writing a new value obtained from input variable `value` into `fcsr`. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub fn fscsr(value: u32) -> u32 { let original: u32; unsafe { asm!("fscsr {}, {}", out(reg) original, in(reg) value, options(nomem, nostack)) } @@ -535,6 +583,7 @@ pub fn fscsr(value: u32) -> u32 { /// | 110 | | _Reserved for future use._ | /// | 111 | DYN | In Rounding Mode register, _reserved_. | #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub fn frrm() -> u32 { let value: u32; unsafe { asm!("frrm {}", out(reg) value, options(nomem, nostack)) }; @@ -547,6 +596,7 @@ pub fn frrm() -> u32 { /// and then writing a new value obtained from the three least-significant bits of /// input variable `value` into `frm`. 
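Because `fsrm` swaps the rounding mode and returns the old one, a save/modify/restore pattern is a single call each way. A minimal sketch, not part of the patch: `with_rtz` is a made-up helper, RTZ is `0b001` per the table above, and changing `frm` around arbitrary Rust float code is assumed to be acceptable for the computation in question.

```rust
#[cfg(target_arch = "riscv64")]
fn with_rtz<R>(f: impl FnOnce() -> R) -> R {
    use core::arch::riscv64::fsrm;
    let saved = fsrm(0b001); // swap in round-toward-zero, keep the old `frm`
    let result = f();
    fsrm(saved);             // put the caller's rounding mode back
    result
}
```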
#[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub fn fsrm(value: u32) -> u32 { let original: u32; unsafe { asm!("fsrm {}, {}", out(reg) original, in(reg) value, options(nomem, nostack)) } @@ -570,6 +620,7 @@ pub fn fsrm(value: u32) -> u32 { /// | 1 | UF | Underflow | /// | 0 | NX | Inexact | #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub fn frflags() -> u32 { let value: u32; unsafe { asm!("frflags {}", out(reg) value, options(nomem, nostack)) }; @@ -582,179 +633,9 @@ pub fn frflags() -> u32 { /// and then writing a new value obtained from the five least-significant bits of /// input variable `value` into `fflags`. #[inline] +#[unstable(feature = "stdsimd", issue = "27731")] pub fn fsflags(value: u32) -> u32 { let original: u32; unsafe { asm!("fsflags {}, {}", out(reg) original, in(reg) value, options(nomem, nostack)) } original } - -/// `P0` transformation function as is used in the SM3 hash algorithm -/// -/// This function is included in `Zksh` extension. It's defined as: -/// -/// ```text -/// P0(X) = X ⊕ (X ≪ 9) ⊕ (X ≪ 17) -/// ``` -/// -/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits. -/// -/// In the SM3 algorithm, the `P0` transformation is used as `E ← P0(TT2)` when the -/// compression function `CF` uses the intermediate value `TT2` to calculate -/// the variable `E` in one iteration for subsequent processes. -/// -/// According to RISC-V Cryptography Extensions, Volume I, the execution latency of -/// this instruction must always be independent from the data it operates on. -#[inline] -#[target_feature(enable = "zksh")] -pub fn sm3p0(x: u32) -> u32 { - let ans: u32; - unsafe { asm!("sm3p0 {}, {}", lateout(reg) ans, in(reg) x, options(pure, nomem, nostack)) }; - ans -} - -/// `P1` transformation function as is used in the SM3 hash algorithm -/// -/// This function is included in `Zksh` extension. It's defined as: -/// -/// ```text -/// P1(X) = X ⊕ (X ≪ 15) ⊕ (X ≪ 23) -/// ``` -/// -/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits. -/// -/// In the SM3 algorithm, the `P1` transformation is used to expand message, -/// where expanded word `Wj` can be generated from the previous words. -/// The whole process can be described as the following pseudocode: -/// -/// ```text -/// FOR j=16 TO 67 -/// Wj ← P1(Wj−16 ⊕ Wj−9 ⊕ (Wj−3 ≪ 15)) ⊕ (Wj−13 ≪ 7) ⊕ Wj−6 -/// ENDFOR -/// ``` -/// -/// According to RISC-V Cryptography Extensions, Volume I, the execution latency of -/// this instruction must always be independent from the data it operates on. -#[inline] -#[target_feature(enable = "zksh")] -pub fn sm3p1(x: u32) -> u32 { - let ans: u32; - unsafe { asm!("sm3p1 {}, {}", lateout(reg) ans, in(reg) x, options(pure, nomem, nostack)) }; - ans -} - -/// Accelerates the round function `F` in the SM4 block cipher algorithm -/// -/// This instruction is included in extension `Zksed`. It's defined as: -/// -/// ```text -/// SM4ED(x, a, BS) = x ⊕ T(ai) -/// ... where -/// ai = a.bytes[BS] -/// T(ai) = L(τ(ai)) -/// bi = τ(ai) = SM4-S-Box(ai) -/// ci = L(bi) = bi ⊕ (bi ≪ 2) ⊕ (bi ≪ 10) ⊕ (bi ≪ 18) ⊕ (bi ≪ 24) -/// SM4ED = (ci ≪ (BS * 8)) ⊕ x -/// ``` -/// -/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits. -/// As is defined above, `T` is a combined transformation of non linear S-Box transform `τ` -/// and linear layer transform `L`. 
-/// -/// In the SM4 algorithm, the round function `F` is defined as: -/// -/// ```text -/// F(x0, x1, x2, x3, rk) = x0 ⊕ T(x1 ⊕ x2 ⊕ x3 ⊕ rk) -/// ... where -/// T(A) = L(τ(A)) -/// B = τ(A) = (SM4-S-Box(a0), SM4-S-Box(a1), SM4-S-Box(a2), SM4-S-Box(a3)) -/// C = L(B) = B ⊕ (B ≪ 2) ⊕ (B ≪ 10) ⊕ (B ≪ 18) ⊕ (B ≪ 24) -/// ``` -/// -/// It can be implemented by `sm4ed` instruction like: -/// -/// ```no_run -/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] -/// # fn round_function(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 { -/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ed; -/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ed; -/// let a = x1 ^ x2 ^ x3 ^ rk; -/// let c0 = sm4ed::<0>(x0, a); -/// let c1 = sm4ed::<1>(c0, a); // c1 represents c[0..=1], etc. -/// let c2 = sm4ed::<2>(c1, a); -/// let c3 = sm4ed::<3>(c2, a); -/// return c3; // c3 represents c[0..=3] -/// # } -/// ``` -/// -/// According to RISC-V Cryptography Extensions, Volume I, the execution latency of -/// this instruction must always be independent from the data it operates on. -#[inline] -#[target_feature(enable = "zksed")] -pub fn sm4ed<const BS: u8>(x: u32, a: u32) -> u32 { - static_assert!(BS <= 3); - let ans: u32; - unsafe { - asm!("sm4ed {}, {}, {}, {}", lateout(reg) ans, in(reg) x, in(reg) a, const BS, options(pure, nomem, nostack)) - }; - ans -} - -/// Accelerates the key schedule operation in the SM4 block cipher algorithm -/// -/// This instruction is included in extension `Zksed`. It's defined as: -/// -/// ```text -/// SM4KS(x, k, BS) = x ⊕ T'(ki) -/// ... where -/// ki = k.bytes[BS] -/// T'(ki) = L'(τ(ki)) -/// bi = τ(ki) = SM4-S-Box(ki) -/// ci = L'(bi) = bi ⊕ (bi ≪ 13) ⊕ (bi ≪ 23) -/// SM4KS = (ci ≪ (BS * 8)) ⊕ x -/// ``` -/// -/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits. -/// As is defined above, `T'` is a combined transformation of non linear S-Box transform `τ` -/// and the replaced linear layer transform `L'`. -/// -/// In the SM4 algorithm, the key schedule is defined as: -/// -/// ```text -/// rk[i] = K[i+4] = K[i] ⊕ T'(K[i+1] ⊕ K[i+2] ⊕ K[i+3] ⊕ CK[i]) -/// ... where -/// K[0..=3] = MK[0..=3] ⊕ FK[0..=3] -/// T'(K) = L'(τ(K)) -/// B = τ(K) = (SM4-S-Box(k0), SM4-S-Box(k1), SM4-S-Box(k2), SM4-S-Box(k3)) -/// C = L'(B) = B ⊕ (B ≪ 13) ⊕ (B ≪ 23) -/// ``` -/// -/// where `MK` represents the input 128-bit encryption key, -/// constants `FK` and `CK` are fixed system configuration constant values defined by the SM4 algorithm. -/// Hence, the key schedule operation can be implemented by `sm4ks` instruction like: -/// -/// ```no_run -/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] -/// # fn key_schedule(k0: u32, k1: u32, k2: u32, k3: u32, ck_i: u32) -> u32 { -/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ks; -/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ks; -/// let k = k1 ^ k2 ^ k3 ^ ck_i; -/// let c0 = sm4ks::<0>(k0, k); -/// let c1 = sm4ks::<1>(c0, k); // c1 represents c[0..=1], etc. -/// let c2 = sm4ks::<2>(c1, k); -/// let c3 = sm4ks::<3>(c2, k); -/// return c3; // c3 represents c[0..=3] -/// # } -/// ``` -/// -/// According to RISC-V Cryptography Extensions, Volume I, the execution latency of -/// this instruction must always be independent from the data it operates on. 
-#[inline] -#[target_feature(enable = "zksed")] -pub fn sm4ks<const BS: u8>(x: u32, k: u32) -> u32 { - static_assert!(BS <= 3); - let ans: u32; - unsafe { - asm!("sm4ks {}, {}, {}, {}", lateout(reg) ans, in(reg) x, in(reg) k, const BS, options(pure, nomem, nostack)) - }; - ans -} diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs b/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs new file mode 100644 index 000000000..cfae6caa5 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs @@ -0,0 +1,150 @@ +#[cfg(test)] +use stdarch_test::assert_instr; + +#[cfg(target_arch = "riscv32")] +extern "unadjusted" { + #[link_name = "llvm.riscv.orc.b.i32"] + fn _orc_b_32(rs: i32) -> i32; + + #[link_name = "llvm.riscv.clmul.i32"] + fn _clmul_32(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.clmulh.i32"] + fn _clmulh_32(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.clmulr.i32"] + fn _clmulr_32(rs1: i32, rs2: i32) -> i32; +} + +#[cfg(target_arch = "riscv64")] +extern "unadjusted" { + #[link_name = "llvm.riscv.orc.b.i64"] + fn _orc_b_64(rs1: i64) -> i64; + + #[link_name = "llvm.riscv.clmul.i64"] + fn _clmul_64(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.clmulh.i64"] + fn _clmulh_64(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.clmulr.i64"] + fn _clmulr_64(rs1: i64, rs2: i64) -> i64; +} + +/// Bitwise OR-Combine, byte granule +/// +/// Combines the bits within every byte through a reciprocal bitwise logical OR. This sets the bits of each byte in +/// the result rd to all zeros if no bit within the respective byte of rs is set, or to all ones if any bit within the +/// respective byte of rs is set. +/// +/// Source: RISC-V Bit-Manipulation ISA-extensions +/// +/// Version: v1.0.0 +/// +/// Section: 2.24 +/// +/// # Safety +/// +/// This function is safe to use if the `zbb` target feature is present. +#[target_feature(enable = "zbb")] +// See #1464 +// #[cfg_attr(test, assert_instr(orc.b))] +#[inline] +pub unsafe fn orc_b(rs: usize) -> usize { + #[cfg(target_arch = "riscv32")] + { + _orc_b_32(rs as i32) as usize + } + + #[cfg(target_arch = "riscv64")] + { + _orc_b_64(rs as i64) as usize + } +} + +/// Carry-less multiply (low-part) +/// +/// clmul produces the lower half of the 2·XLEN carry-less product. +/// +/// Source: RISC-V Bit-Manipulation ISA-extensions +/// +/// Version: v1.0.0 +/// +/// Section: 2.11 +/// +/// # Safety +/// +/// This function is safe to use if the `zbc` target feature is present. +#[target_feature(enable = "zbc")] +// See #1464 +// #[cfg_attr(test, assert_instr(clmul))] +#[inline] +pub unsafe fn clmul(rs1: usize, rs2: usize) -> usize { + #[cfg(target_arch = "riscv32")] + { + _clmul_32(rs1 as i32, rs2 as i32) as usize + } + + #[cfg(target_arch = "riscv64")] + { + _clmul_64(rs1 as i64, rs2 as i64) as usize + } +} + +/// Carry-less multiply (high-part) +/// +/// clmulh produces the upper half of the 2·XLEN carry-less product. +/// +/// Source: RISC-V Bit-Manipulation ISA-extensions +/// +/// Version: v1.0.0 +/// +/// Section: 2.12 +/// +/// # Safety +/// +/// This function is safe to use if the `zbc` target feature is present. 
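Pairing `clmul` with `clmulh` yields the full 2·XLEN-bit carry-less product, the usual building block for CRCs and GHASH-style polynomial multiplication. A minimal sketch, not part of the patch; the helper name is made up.

```rust
#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
#[target_feature(enable = "zbc")]
unsafe fn clmul_wide(a: usize, b: usize) -> (usize, usize) {
    #[cfg(target_arch = "riscv32")]
    use core::arch::riscv32::{clmul, clmulh};
    #[cfg(target_arch = "riscv64")]
    use core::arch::riscv64::{clmul, clmulh};
    // Returns (high half, low half) of the carry-less product of `a` and `b`.
    (clmulh(a, b), clmul(a, b))
}
```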
+#[target_feature(enable = "zbc")] +// See #1464 +// #[cfg_attr(test, assert_instr(clmulh))] +#[inline] +pub unsafe fn clmulh(rs1: usize, rs2: usize) -> usize { + #[cfg(target_arch = "riscv32")] + { + _clmulh_32(rs1 as i32, rs2 as i32) as usize + } + + #[cfg(target_arch = "riscv64")] + { + _clmulh_64(rs1 as i64, rs2 as i64) as usize + } +} + +/// Carry-less multiply (reversed) +/// +/// clmulr produces bits 2·XLEN−2:XLEN-1 of the 2·XLEN carry-less product. +/// +/// Source: RISC-V Bit-Manipulation ISA-extensions +/// +/// Version: v1.0.0 +/// +/// Section: 2.13 +/// +/// # Safety +/// +/// This function is safe to use if the `zbc` target feature is present. +#[target_feature(enable = "zbc")] +// See #1464 +// #[cfg_attr(test, assert_instr(clmulr))] +#[inline] +pub unsafe fn clmulr(rs1: usize, rs2: usize) -> usize { + #[cfg(target_arch = "riscv32")] + { + _clmulr_32(rs1 as i32, rs2 as i32) as usize + } + + #[cfg(target_arch = "riscv64")] + { + _clmulr_64(rs1 as i64, rs2 as i64) as usize + } +} diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs b/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs new file mode 100644 index 000000000..db97f72bc --- /dev/null +++ b/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs @@ -0,0 +1,462 @@ +#[cfg(test)] +use stdarch_test::assert_instr; + +extern "unadjusted" { + #[link_name = "llvm.riscv.sm4ed"] + fn _sm4ed(rs1: i32, rs2: i32, bs: i32) -> i32; + + #[link_name = "llvm.riscv.sm4ks"] + fn _sm4ks(rs1: i32, rs2: i32, bs: i32) -> i32; + + #[link_name = "llvm.riscv.sm3p0"] + fn _sm3p0(rs1: i32) -> i32; + + #[link_name = "llvm.riscv.sm3p1"] + fn _sm3p1(rs1: i32) -> i32; + + #[link_name = "llvm.riscv.sha256sig0"] + fn _sha256sig0(rs1: i32) -> i32; + + #[link_name = "llvm.riscv.sha256sig1"] + fn _sha256sig1(rs1: i32) -> i32; + + #[link_name = "llvm.riscv.sha256sum0"] + fn _sha256sum0(rs1: i32) -> i32; + + #[link_name = "llvm.riscv.sha256sum1"] + fn _sha256sum1(rs1: i32) -> i32; +} + +#[cfg(target_arch = "riscv32")] +extern "unadjusted" { + #[link_name = "llvm.riscv.xperm8.i32"] + fn _xperm8_32(rs1: i32, rs2: i32) -> i32; + + #[link_name = "llvm.riscv.xperm4.i32"] + fn _xperm4_32(rs1: i32, rs2: i32) -> i32; +} + +#[cfg(target_arch = "riscv64")] +extern "unadjusted" { + #[link_name = "llvm.riscv.xperm8.i64"] + fn _xperm8_64(rs1: i64, rs2: i64) -> i64; + + #[link_name = "llvm.riscv.xperm4.i64"] + fn _xperm4_64(rs1: i64, rs2: i64) -> i64; +} + +/// Byte-wise lookup of indicies into a vector in registers. +/// +/// The xperm8 instruction operates on bytes. The rs1 register contains a vector of XLEN/8 +/// 8-bit elements. The rs2 register contains a vector of XLEN/8 8-bit indexes. The result is +/// each element in rs2 replaced by the indexed element in rs1, or zero if the index into rs2 +/// is out of bounds. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.47 +/// +/// # Safety +/// +/// This function is safe to use if the `zbkx` target feature is present. +#[target_feature(enable = "zbkx")] +// See #1464 +// #[cfg_attr(test, assert_instr(xperm8))] +#[inline] +pub unsafe fn xperm8(rs1: usize, rs2: usize) -> usize { + #[cfg(target_arch = "riscv32")] + { + _xperm8_32(rs1 as i32, rs2 as i32) as usize + } + + #[cfg(target_arch = "riscv64")] + { + _xperm8_64(rs1 as i64, rs2 as i64) as usize + } +} + +/// Nibble-wise lookup of indicies into a vector. +/// +/// The xperm4 instruction operates on nibbles. 
The rs1 register contains a vector of XLEN/4 +/// 4-bit elements. The rs2 register contains a vector of XLEN/4 4-bit indexes. The result is +/// each element in rs2 replaced by the indexed element in rs1, or zero if the index into rs2 +/// is out of bounds. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.48 +/// +/// # Safety +/// +/// This function is safe to use if the `zbkx` target feature is present. +#[target_feature(enable = "zbkx")] +// See #1464 +// #[cfg_attr(test, assert_instr(xperm4))] +#[inline] +pub unsafe fn xperm4(rs1: usize, rs2: usize) -> usize { + #[cfg(target_arch = "riscv32")] + { + _xperm4_32(rs1 as i32, rs2 as i32) as usize + } + + #[cfg(target_arch = "riscv64")] + { + _xperm4_64(rs1 as i64, rs2 as i64) as usize + } +} + +/// Implements the Sigma0 transformation function as used in the SHA2-256 hash function \[49\] +/// (Section 4.1.2). +/// +/// This instruction is supported for both RV32 and RV64 base architectures. For RV32, the +/// entire XLEN source register is operated on. For RV64, the low 32 bits of the source +/// register are operated on, and the result sign extended to XLEN bits. Though named for +/// SHA2-256, the instruction works for both the SHA2-224 and SHA2-256 parameterisations as +/// described in \[49\]. This instruction must always be implemented such that its execution +/// latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.27 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha256sig0))] +#[inline] +pub unsafe fn sha256sig0(rs1: u32) -> u32 { + _sha256sig0(rs1 as i32) as u32 +} + +/// Implements the Sigma1 transformation function as used in the SHA2-256 hash function \[49\] +/// (Section 4.1.2). +/// +/// This instruction is supported for both RV32 and RV64 base architectures. For RV32, the +/// entire XLEN source register is operated on. For RV64, the low 32 bits of the source +/// register are operated on, and the result sign extended to XLEN bits. Though named for +/// SHA2-256, the instruction works for both the SHA2-224 and SHA2-256 parameterisations as +/// described in \[49\]. This instruction must always be implemented such that its execution +/// latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.28 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha256sig1))] +#[inline] +pub unsafe fn sha256sig1(rs1: u32) -> u32 { + _sha256sig1(rs1 as i32) as u32 +} + +/// Implements the Sum0 transformation function as used in the SHA2-256 hash function \[49\] +/// (Section 4.1.2). +/// +/// This instruction is supported for both RV32 and RV64 base architectures. For RV32, the +/// entire XLEN source register is operated on. For RV64, the low 32 bits of the source +/// register are operated on, and the result sign extended to XLEN bits. Though named for +/// SHA2-256, the instruction works for both the SHA2-224 and SHA2-256 parameterisations as +/// described in \[49\]. 
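The two sigma intrinsics slot directly into the SHA-256 message schedule, W[t] = σ1(W[t-2]) + W[t-7] + σ0(W[t-15]) + W[t-16]. A minimal sketch, not part of the patch; `w` is assumed to be a 16-word ring buffer holding the previous schedule words, and the helper name is made up.

```rust
#[cfg(target_arch = "riscv64")]
#[target_feature(enable = "zknh")]
unsafe fn sha256_expand_word(w: &[u32; 16], t: usize) -> u32 {
    use core::arch::riscv64::{sha256sig0, sha256sig1};
    // Ring-buffer indices: w[t % 16] currently holds W[t-16], and so on.
    sha256sig1(w[(t + 14) % 16])        // σ1(W[t-2])
        .wrapping_add(w[(t + 9) % 16])  // + W[t-7]
        .wrapping_add(sha256sig0(w[(t + 1) % 16])) // + σ0(W[t-15])
        .wrapping_add(w[t % 16])        // + W[t-16]
}
```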
This instruction must always be implemented such that its execution +/// latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.29 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha256sum0))] +#[inline] +pub unsafe fn sha256sum0(rs1: u32) -> u32 { + _sha256sum0(rs1 as i32) as u32 +} + +/// Implements the Sum1 transformation function as used in the SHA2-256 hash function \[49\] +/// (Section 4.1.2). +/// +/// This instruction is supported for both RV32 and RV64 base architectures. For RV32, the +/// entire XLEN source register is operated on. For RV64, the low 32 bits of the source +/// register are operated on, and the result sign extended to XLEN bits. Though named for +/// SHA2-256, the instruction works for both the SHA2-224 and SHA2-256 parameterisations as +/// described in \[49\]. This instruction must always be implemented such that its execution +/// latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.30 +/// +/// # Safety +/// +/// This function is safe to use if the `zknh` target feature is present. +#[target_feature(enable = "zknh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sha256sum1))] +#[inline] +pub unsafe fn sha256sum1(rs1: u32) -> u32 { + _sha256sum1(rs1 as i32) as u32 +} + +/// Accelerates the block encrypt/decrypt operation of the SM4 block cipher \[5, 31\]. +/// +/// Implements a T-tables in hardware style approach to accelerating the SM4 round function. A +/// byte is extracted from rs2 based on bs, to which the SBox and linear layer transforms are +/// applied, before the result is XOR’d with rs1 and written back to rd. This instruction +/// exists on RV32 and RV64 base architectures. On RV64, the 32-bit result is sign extended to +/// XLEN bits. This instruction must always be implemented such that its execution latency does +/// not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.43 +/// +/// # Note +/// +/// The `BS` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are +/// used. +/// +/// # Safety +/// +/// This function is safe to use if the `zksed` target feature is present. +/// +/// # Details +/// +/// Accelerates the round function `F` in the SM4 block cipher algorithm +/// +/// This instruction is included in extension `Zksed`. It's defined as: +/// +/// ```text +/// SM4ED(x, a, BS) = x ⊕ T(ai) +/// ... where +/// ai = a.bytes[BS] +/// T(ai) = L(τ(ai)) +/// bi = τ(ai) = SM4-S-Box(ai) +/// ci = L(bi) = bi ⊕ (bi ≪ 2) ⊕ (bi ≪ 10) ⊕ (bi ≪ 18) ⊕ (bi ≪ 24) +/// SM4ED = (ci ≪ (BS * 8)) ⊕ x +/// ``` +/// +/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits. +/// As is defined above, `T` is a combined transformation of non linear S-Box transform `τ` +/// and linear layer transform `L`. +/// +/// In the SM4 algorithm, the round function `F` is defined as: +/// +/// ```text +/// F(x0, x1, x2, x3, rk) = x0 ⊕ T(x1 ⊕ x2 ⊕ x3 ⊕ rk) +/// ... 
where +/// T(A) = L(τ(A)) +/// B = τ(A) = (SM4-S-Box(a0), SM4-S-Box(a1), SM4-S-Box(a2), SM4-S-Box(a3)) +/// C = L(B) = B ⊕ (B ≪ 2) ⊕ (B ≪ 10) ⊕ (B ≪ 18) ⊕ (B ≪ 24) +/// ``` +/// +/// It can be implemented by `sm4ed` instruction like: +/// +/// ```no_run +/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] +/// # fn round_function(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 { +/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ed; +/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ed; +/// let a = x1 ^ x2 ^ x3 ^ rk; +/// let c0 = sm4ed(x0, a, 0); +/// let c1 = sm4ed(c0, a, 1); // c1 represents c[0..=1], etc. +/// let c2 = sm4ed(c1, a, 2); +/// let c3 = sm4ed(c2, a, 3); +/// return c3; // c3 represents c[0..=3] +/// # } +/// ``` +#[target_feature(enable = "zksed")] +#[rustc_legacy_const_generics(2)] +// See #1464 +// #[cfg_attr(test, assert_instr(sm4ed, BS = 0))] +#[inline] +pub unsafe fn sm4ed<const BS: u8>(rs1: u32, rs2: u32) -> u32 { + static_assert!(BS < 4); + + _sm4ed(rs1 as i32, rs2 as i32, BS as i32) as u32 +} + +/// Accelerates the Key Schedule operation of the SM4 block cipher \[5, 31\] with `bs=0`. +/// +/// Implements a T-tables in hardware style approach to accelerating the SM4 Key Schedule. A +/// byte is extracted from rs2 based on bs, to which the SBox and linear layer transforms are +/// applied, before the result is XOR’d with rs1 and written back to rd. This instruction +/// exists on RV32 and RV64 base architectures. On RV64, the 32-bit result is sign extended to +/// XLEN bits. This instruction must always be implemented such that its execution latency does +/// not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.44 +/// +/// # Note +/// +/// The `BS` parameter is expected to be a constant value and only the bottom 2 bits of `bs` are +/// used. +/// +/// # Safety +/// +/// This function is safe to use if the `zksed` target feature is present. +/// +/// # Details +/// +/// Accelerates the round function `F` in the SM4 block cipher algorithm +/// +/// This instruction is included in extension `Zksed`. It's defined as: +/// +/// ```text +/// SM4ED(x, a, BS) = x ⊕ T(ai) +/// ... where +/// ai = a.bytes[BS] +/// T(ai) = L(τ(ai)) +/// bi = τ(ai) = SM4-S-Box(ai) +/// ci = L(bi) = bi ⊕ (bi ≪ 2) ⊕ (bi ≪ 10) ⊕ (bi ≪ 18) ⊕ (bi ≪ 24) +/// SM4ED = (ci ≪ (BS * 8)) ⊕ x +/// ``` +/// +/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits. +/// As is defined above, `T` is a combined transformation of non linear S-Box transform `τ` +/// and linear layer transform `L`. +/// +/// In the SM4 algorithm, the round function `F` is defined as: +/// +/// ```text +/// F(x0, x1, x2, x3, rk) = x0 ⊕ T(x1 ⊕ x2 ⊕ x3 ⊕ rk) +/// ... where +/// T(A) = L(τ(A)) +/// B = τ(A) = (SM4-S-Box(a0), SM4-S-Box(a1), SM4-S-Box(a2), SM4-S-Box(a3)) +/// C = L(B) = B ⊕ (B ≪ 2) ⊕ (B ≪ 10) ⊕ (B ≪ 18) ⊕ (B ≪ 24) +/// ``` +/// +/// It can be implemented by `sm4ed` instruction like: +/// +/// ```no_run +/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] +/// # fn round_function(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 { +/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ed; +/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ed; +/// let a = x1 ^ x2 ^ x3 ^ rk; +/// let c0 = sm4ed(x0, a, 0); +/// let c1 = sm4ed(c0, a, 1); // c1 represents c[0..=1], etc. 
+/// let c2 = sm4ed(c1, a, 2); +/// let c3 = sm4ed(c2, a, 3); +/// return c3; // c3 represents c[0..=3] +/// # } +/// ``` +#[target_feature(enable = "zksed")] +#[rustc_legacy_const_generics(2)] +// See #1464 +// #[cfg_attr(test, assert_instr(sm4ks, BS = 0))] +#[inline] +pub unsafe fn sm4ks<const BS: u8>(rs1: u32, rs2: u32) -> u32 { + static_assert!(BS < 4); + + _sm4ks(rs1 as i32, rs2 as i32, BS as i32) as u32 +} + +/// Implements the P0 transformation function as used in the SM3 hash function [4, 30]. +/// +/// This instruction is supported for the RV32 and RV64 base architectures. It implements the +/// P0 transform of the SM3 hash function [4, 30]. This instruction must always be implemented +/// such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.41 +/// +/// # Safety +/// +/// This function is safe to use if the `zksh` target feature is present. +/// +/// # Details +/// +/// `P0` transformation function as is used in the SM3 hash algorithm +/// +/// This function is included in `Zksh` extension. It's defined as: +/// +/// ```text +/// P0(X) = X ⊕ (X ≪ 9) ⊕ (X ≪ 17) +/// ``` +/// +/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits. +/// +/// In the SM3 algorithm, the `P0` transformation is used as `E ← P0(TT2)` when the +/// compression function `CF` uses the intermediate value `TT2` to calculate +/// the variable `E` in one iteration for subsequent processes. +#[target_feature(enable = "zksh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sm3p0))] +#[inline] +pub unsafe fn sm3p0(rs1: u32) -> u32 { + _sm3p0(rs1 as i32) as u32 +} + +/// Implements the P1 transformation function as used in the SM3 hash function [4, 30]. +/// +/// This instruction is supported for the RV32 and RV64 base architectures. It implements the +/// P1 transform of the SM3 hash function [4, 30]. This instruction must always be implemented +/// such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.42 +/// +/// # Safety +/// +/// This function is safe to use if the `zksh` target feature is present. +/// +/// # Details +/// +/// `P1` transformation function as is used in the SM3 hash algorithm +/// +/// This function is included in `Zksh` extension. It's defined as: +/// +/// ```text +/// P1(X) = X ⊕ (X ≪ 15) ⊕ (X ≪ 23) +/// ``` +/// +/// where `⊕` represents 32-bit xor, and `≪ k` represents rotate left by `k` bits. +/// +/// In the SM3 algorithm, the `P1` transformation is used to expand message, +/// where expanded word `Wj` can be generated from the previous words. 
+/// The whole process can be described as the following pseudocode: +/// +/// ```text +/// FOR j=16 TO 67 +/// Wj ← P1(Wj−16 ⊕ Wj−9 ⊕ (Wj−3 ≪ 15)) ⊕ (Wj−13 ≪ 7) ⊕ Wj−6 +/// ENDFOR +/// ``` +#[target_feature(enable = "zksh")] +// See #1464 +// #[cfg_attr(test, assert_instr(sm3p1))] +#[inline] +pub unsafe fn sm3p1(rs1: u32) -> u32 { + _sm3p1(rs1 as i32) as u32 +} diff --git a/library/stdarch/crates/core_arch/src/wasm32/relaxed_simd.rs b/library/stdarch/crates/core_arch/src/wasm32/relaxed_simd.rs index 8fe935d1f..403fc79d0 100644 --- a/library/stdarch/crates/core_arch/src/wasm32/relaxed_simd.rs +++ b/library/stdarch/crates/core_arch/src/wasm32/relaxed_simd.rs @@ -303,11 +303,11 @@ pub fn i32x4_relaxed_dot_i8x16_i7x16_add(a: v128, b: v128, c: v128) -> v128 { } #[cfg(test)] -pub mod tests { +mod tests { use super::super::simd128::*; use super::*; use core::ops::{Add, Div, Mul, Neg, Sub}; - use std; + use std::fmt::Debug; use std::mem::transmute; use std::num::Wrapping; diff --git a/library/stdarch/crates/core_arch/src/wasm32/simd128.rs b/library/stdarch/crates/core_arch/src/wasm32/simd128.rs index e974d9e56..4819195dc 100644 --- a/library/stdarch/crates/core_arch/src/wasm32/simd128.rs +++ b/library/stdarch/crates/core_arch/src/wasm32/simd128.rs @@ -672,7 +672,6 @@ pub unsafe fn v128_store64_lane<const L: usize>(v: v128, m: *mut u64) { /// If possible this will generate a `v128.const` instruction, otherwise it may /// be lowered to a sequence of instructions to materialize the vector value. #[inline] -#[target_feature(enable = "simd128")] #[cfg_attr( test, assert_instr( @@ -727,7 +726,6 @@ pub const fn i8x16( /// If possible this will generate a `v128.const` instruction, otherwise it may /// be lowered to a sequence of instructions to materialize the vector value. #[inline] -#[target_feature(enable = "simd128")] #[doc(alias("v128.const"))] #[stable(feature = "wasm_simd", since = "1.54.0")] #[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")] @@ -760,7 +758,6 @@ pub const fn u8x16( /// If possible this will generate a `v128.const` instruction, otherwise it may /// be lowered to a sequence of instructions to materialize the vector value. #[inline] -#[target_feature(enable = "simd128")] #[cfg_attr( test, assert_instr( @@ -787,7 +784,6 @@ pub const fn i16x8(a0: i16, a1: i16, a2: i16, a3: i16, a4: i16, a5: i16, a6: i16 /// If possible this will generate a `v128.const` instruction, otherwise it may /// be lowered to a sequence of instructions to materialize the vector value. #[inline] -#[target_feature(enable = "simd128")] #[doc(alias("v128.const"))] #[stable(feature = "wasm_simd", since = "1.54.0")] #[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")] @@ -800,7 +796,6 @@ pub const fn u16x8(a0: u16, a1: u16, a2: u16, a3: u16, a4: u16, a5: u16, a6: u16 /// If possible this will generate a `v128.const` instruction, otherwise it may /// be lowered to a sequence of instructions to materialize the vector value. #[inline] -#[target_feature(enable = "simd128")] #[cfg_attr(test, assert_instr(v128.const, a0 = 0, a1 = 1, a2 = 2, a3 = 3))] #[doc(alias("v128.const"))] #[stable(feature = "wasm_simd", since = "1.54.0")] @@ -814,7 +809,6 @@ pub const fn i32x4(a0: i32, a1: i32, a2: i32, a3: i32) -> v128 { /// If possible this will generate a `v128.const` instruction, otherwise it may /// be lowered to a sequence of instructions to materialize the vector value. 
#[inline] -#[target_feature(enable = "simd128")] #[doc(alias("v128.const"))] #[stable(feature = "wasm_simd", since = "1.54.0")] #[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")] @@ -827,7 +821,6 @@ pub const fn u32x4(a0: u32, a1: u32, a2: u32, a3: u32) -> v128 { /// If possible this will generate a `v128.const` instruction, otherwise it may /// be lowered to a sequence of instructions to materialize the vector value. #[inline] -#[target_feature(enable = "simd128")] #[cfg_attr(test, assert_instr(v128.const, a0 = 1, a1 = 2))] #[doc(alias("v128.const"))] #[stable(feature = "wasm_simd", since = "1.54.0")] @@ -841,7 +834,6 @@ pub const fn i64x2(a0: i64, a1: i64) -> v128 { /// If possible this will generate a `v128.const` instruction, otherwise it may /// be lowered to a sequence of instructions to materialize the vector value. #[inline] -#[target_feature(enable = "simd128")] #[doc(alias("v128.const"))] #[stable(feature = "wasm_simd", since = "1.54.0")] #[rustc_const_stable(feature = "wasm_simd", since = "1.54.0")] @@ -854,7 +846,6 @@ pub const fn u64x2(a0: u64, a1: u64) -> v128 { /// If possible this will generate a `v128.const` instruction, otherwise it may /// be lowered to a sequence of instructions to materialize the vector value. #[inline] -#[target_feature(enable = "simd128")] #[cfg_attr(test, assert_instr(v128.const, a0 = 0.0, a1 = 1.0, a2 = 2.0, a3 = 3.0))] #[doc(alias("v128.const"))] #[stable(feature = "wasm_simd", since = "1.54.0")] @@ -868,7 +859,6 @@ pub const fn f32x4(a0: f32, a1: f32, a2: f32, a3: f32) -> v128 { /// If possible this will generate a `v128.const` instruction, otherwise it may /// be lowered to a sequence of instructions to materialize the vector value. #[inline] -#[target_feature(enable = "simd128")] #[cfg_attr(test, assert_instr(v128.const, a0 = 0.0, a1 = 1.0))] #[doc(alias("v128.const"))] #[stable(feature = "wasm_simd", since = "1.54.0")] @@ -3212,7 +3202,7 @@ pub fn i32x4_shr(a: v128, amt: u32) -> v128 { #[doc(alias("i32x4.shr_u"))] #[stable(feature = "wasm_simd", since = "1.54.0")] pub fn u32x4_shr(a: v128, amt: u32) -> v128 { - unsafe { simd_shr(a.as_u32x4(), simd::u32x4::splat(amt as u32)).v128() } + unsafe { simd_shr(a.as_u32x4(), simd::u32x4::splat(amt)).v128() } } /// Adds two 128-bit vectors as if they were two packed four 32-bit integers. 
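Aside on the new riscv_shared/zk.rs intrinsics further up in this diff: the `P0`/`P1` formulas quoted in the `sm3p0`/`sm3p1` doc comments translate directly into plain Rust rotates. A minimal, purely illustrative cross-check (not part of the patch; on a RISC-V target with `zksh` the intrinsics should agree with it):

```rust
/// Reference versions of the SM3 `P0`/`P1` transforms, written from the
/// formulas quoted in the zk.rs doc comments above (illustrative only).
fn p0(x: u32) -> u32 {
    x ^ x.rotate_left(9) ^ x.rotate_left(17)
}

fn p1(x: u32) -> u32 {
    x ^ x.rotate_left(15) ^ x.rotate_left(23)
}

fn main() {
    let x = 0x1234_5678u32;
    // On RISC-V with `zksh`, `sm3p0(x)` / `sm3p1(x)` should return the same values.
    println!("P0 = {:08x}, P1 = {:08x}", p0(x), p1(x));
}
```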
@@ -4236,10 +4226,10 @@ pub fn f64x2_promote_low_f32x4(a: v128) -> v128 { } #[cfg(test)] -pub mod tests { +mod tests { use super::*; use core::ops::{Add, Div, Mul, Neg, Sub}; - use std; + use std::fmt::Debug; use std::mem::transmute; use std::num::Wrapping; @@ -4587,8 +4577,8 @@ pub mod tests { u8::MAX.into(), ), i16x8( - i16::MIN.into(), - i16::MAX.into(), + i16::MIN, + i16::MAX, u16::MIN as i16, u16::MAX as i16, 0, @@ -4613,8 +4603,8 @@ pub mod tests { u8::MAX.into(), ), i16x8( - i16::MIN.into(), - i16::MAX.into(), + i16::MIN, + i16::MAX, u16::MIN as i16, u16::MAX as i16, 0, @@ -4634,12 +4624,7 @@ pub mod tests { compare_bytes( i16x8_narrow_i32x4( i32x4(0, -1, i16::MIN.into(), i16::MAX.into()), - i32x4( - i32::MIN.into(), - i32::MAX.into(), - u32::MIN as i32, - u32::MAX as i32, - ), + i32x4(i32::MIN, i32::MAX, u32::MIN as i32, u32::MAX as i32), ), i16x8(0, -1, i16::MIN, i16::MAX, i16::MIN, i16::MAX, 0, -1), ); @@ -4647,12 +4632,7 @@ pub mod tests { compare_bytes( u16x8_narrow_i32x4( i32x4(u16::MAX.into(), -1, i16::MIN.into(), i16::MAX.into()), - i32x4( - i32::MIN.into(), - i32::MAX.into(), - u32::MIN as i32, - u32::MAX as i32, - ), + i32x4(i32::MIN, i32::MAX, u32::MIN as i32, u32::MAX as i32), ), i16x8(-1, 0, 0, i16::MAX, 0, -1, 0, 0), ); diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs index fafee5c0b..00bcc1fa1 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx.rs @@ -738,7 +738,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d { static_assert_uimm_bits!(IMM5, 5); - vcmppd(a, b, IMM5 as i8) + vcmppd(a, b, const { IMM5 as i8 }) } /// Compares packed double-precision (64-bit) floating-point @@ -768,7 +768,7 @@ pub unsafe fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 { static_assert_uimm_bits!(IMM5, 5); - vcmpps(a, b, IMM5 as i8) + vcmpps(a, b, const { IMM5 as i8 }) } /// Compares packed single-precision (32-bit) floating-point @@ -783,7 +783,7 @@ pub unsafe fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 { static_assert_uimm_bits!(IMM5, 5); - vcmpps256(a, b, IMM5 as u8) + vcmpps256(a, b, const { IMM5 as u8 }) } /// Compares the lower double-precision (64-bit) floating-point element in @@ -1028,7 +1028,7 @@ pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vpermilps, IMM8 = 9))] +#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 { @@ -1055,7 +1055,7 @@ pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps) #[inline] #[target_feature(enable = "avx,sse")] -#[cfg_attr(test, assert_instr(vpermilps, IMM8 = 9))] +#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))] 
#[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 { @@ -1102,7 +1102,7 @@ pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vpermilpd, IMM4 = 0x1))] +#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d { @@ -1125,7 +1125,7 @@ pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd) #[inline] #[target_feature(enable = "avx,sse2")] -#[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0x1))] +#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d { @@ -1439,7 +1439,7 @@ pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d { #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) { - storeupd256(mem_addr, a); + mem_addr.cast::<__m256d>().write_unaligned(a); } /// Loads 256-bits (composed of 8 packed single-precision (32-bit) @@ -1471,7 +1471,7 @@ pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 { #[cfg_attr(test, assert_instr(vmovups))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) { - storeups256(mem_addr, a); + mem_addr.cast::<__m256>().write_unaligned(a); } /// Loads 256-bits of integer data from memory into result. @@ -1519,7 +1519,7 @@ pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i { } /// Stores 256-bits of integer data from `a` into memory. -/// `mem_addr` does not need to be aligned on any particular boundary. +/// `mem_addr` does not need to be aligned on any particular boundary. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256) #[inline] @@ -1527,7 +1527,7 @@ pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { - storeudq256(mem_addr as *mut i8, a.as_i8x32()); + mem_addr.write_unaligned(a); } /// Loads packed double-precision (64-bit) floating-point elements from memory @@ -2974,12 +2974,6 @@ extern "C" { fn vbroadcastf128ps256(a: &__m128) -> __m256; #[link_name = "llvm.x86.avx.vbroadcastf128.pd.256"] fn vbroadcastf128pd256(a: &__m128d) -> __m256d; - #[link_name = "llvm.x86.avx.storeu.pd.256"] - fn storeupd256(mem_addr: *mut f64, a: __m256d); - #[link_name = "llvm.x86.avx.storeu.ps.256"] - fn storeups256(mem_addr: *mut f32, a: __m256); - #[link_name = "llvm.x86.avx.storeu.dq.256"] - fn storeudq256(mem_addr: *mut i8, a: i8x32); #[link_name = "llvm.x86.avx.maskload.pd.256"] fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d; #[link_name = "llvm.x86.avx.maskstore.pd.256"] diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs index cdf84b382..e23c795ee 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -2373,7 +2373,7 @@ pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32) #[inline] #[target_feature(enable = "avx2")] -#[cfg_attr(test, assert_instr(vpermilps, MASK = 9))] +#[cfg_attr(test, assert_instr(vshufps, MASK = 9))] #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i { @@ -2557,7 +2557,11 @@ pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - transmute(pslliw(a.as_i16x16(), IMM8)) + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } } /// Shifts packed 32-bit integers in `a` left by `IMM8` while @@ -2571,7 +2575,11 @@ pub unsafe fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - transmute(psllid(a.as_i32x8(), IMM8)) + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } } /// Shifts packed 64-bit integers in `a` left by `IMM8` while @@ -2585,7 +2593,11 @@ pub unsafe fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - transmute(pslliq(a.as_i64x4(), IMM8)) + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } } /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. 
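The avx2.rs shift hunks above drop the `llvm.x86.avx2.pslli.*` intrinsics in favour of portable `simd_shl`, guarded by an `IMM8 >= width` check: the hardware instruction zeroes the result for out-of-range immediates, whereas an unguarded vector shift by the lane width or more would be undefined. A scalar sketch of the semantics the new bodies encode (illustrative, not stdarch code):

```rust
/// Scalar model of the immediate left shift implemented by the rewritten
/// `_mm256_slli_epi16` body: counts of 16 or more yield zero instead of UB.
fn slli_epi16_lane(lane: u16, imm8: u32) -> u16 {
    if imm8 >= 16 { 0 } else { lane << imm8 }
}

fn main() {
    assert_eq!(slli_epi16_lane(0x8001, 1), 0x0002);
    assert_eq!(slli_epi16_lane(0x8001, 16), 0); // count == lane width saturates to zero
    println!("ok");
}
```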
@@ -2749,7 +2761,7 @@ pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - transmute(psraiw(a.as_i16x16(), IMM8)) + transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) } /// Shifts packed 32-bit integers in `a` right by `IMM8` while @@ -2763,7 +2775,7 @@ pub unsafe fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - transmute(psraid(a.as_i32x8(), IMM8)) + transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) } /// Shifts packed 32-bit integers in `a` right by the amount specified by the @@ -2996,7 +3008,11 @@ pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - transmute(psrliw(a.as_i16x16(), IMM8)) + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } } /// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in @@ -3010,7 +3026,11 @@ pub unsafe fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - transmute(psrlid(a.as_i32x8(), IMM8)) + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } } /// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in @@ -3024,7 +3044,11 @@ pub unsafe fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - transmute(psrliq(a.as_i64x4(), IMM8)) + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } } /// Shifts packed 32-bit integers in `a` right by the amount specified by @@ -3677,12 +3701,6 @@ extern "C" { fn pslld(a: i32x8, count: i32x4) -> i32x8; #[link_name = "llvm.x86.avx2.psll.q"] fn psllq(a: i64x4, count: i64x2) -> i64x4; - #[link_name = "llvm.x86.avx2.pslli.w"] - fn pslliw(a: i16x16, imm8: i32) -> i16x16; - #[link_name = "llvm.x86.avx2.pslli.d"] - fn psllid(a: i32x8, imm8: i32) -> i32x8; - #[link_name = "llvm.x86.avx2.pslli.q"] - fn pslliq(a: i64x4, imm8: i32) -> i64x4; #[link_name = "llvm.x86.avx2.psllv.d"] fn psllvd(a: i32x4, count: i32x4) -> i32x4; #[link_name = "llvm.x86.avx2.psllv.d.256"] @@ -3695,10 +3713,6 @@ extern "C" { fn psraw(a: i16x16, count: i16x8) -> i16x16; #[link_name = "llvm.x86.avx2.psra.d"] fn psrad(a: i32x8, count: i32x4) -> i32x8; - #[link_name = "llvm.x86.avx2.psrai.w"] - fn psraiw(a: i16x16, imm8: i32) -> i16x16; - #[link_name = "llvm.x86.avx2.psrai.d"] - fn psraid(a: i32x8, imm8: i32) -> i32x8; #[link_name = "llvm.x86.avx2.psrav.d"] fn psravd(a: i32x4, count: i32x4) -> i32x4; #[link_name = "llvm.x86.avx2.psrav.d.256"] @@ -3709,12 +3723,6 @@ extern "C" { fn psrld(a: i32x8, count: i32x4) -> i32x8; #[link_name = "llvm.x86.avx2.psrl.q"] fn psrlq(a: i64x4, count: i64x2) -> i64x4; - #[link_name = "llvm.x86.avx2.psrli.w"] - 
fn psrliw(a: i16x16, imm8: i32) -> i16x16; - #[link_name = "llvm.x86.avx2.psrli.d"] - fn psrlid(a: i32x8, imm8: i32) -> i32x8; - #[link_name = "llvm.x86.avx2.psrli.q"] - fn psrliq(a: i64x4, imm8: i32) -> i64x4; #[link_name = "llvm.x86.avx2.psrlv.d"] fn psrlvd(a: i32x4, count: i32x4) -> i32x4; #[link_name = "llvm.x86.avx2.psrlv.d.256"] diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs index bc1e7ddfb..364023539 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs @@ -5339,9 +5339,11 @@ pub unsafe fn _mm_maskz_sll_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __ #[rustc_legacy_const_generics(1)] pub unsafe fn _mm512_slli_epi16<const IMM8: u32>(a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i16x32(); - let r = vpslliw(a, IMM8); - transmute(r) + if IMM8 >= 16 { + _mm512_setzero_si512() + } else { + transmute(simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16))) + } } /// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5357,9 +5359,12 @@ pub unsafe fn _mm512_mask_slli_epi16<const IMM8: u32>( a: __m512i, ) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i16x32(); - let shf = vpslliw(a, IMM8); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + let shf = if IMM8 >= 16 { + u16x32::splat(0) + } else { + simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u16x32())) } /// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5371,10 +5376,13 @@ pub unsafe fn _mm512_mask_slli_epi16<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm512_maskz_slli_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i16x32(); - let shf = vpslliw(a, IMM8); - let zero = _mm512_setzero_si512().as_i16x32(); - transmute(simd_select_bitmask(k, shf, zero)) + if IMM8 >= 16 { + _mm512_setzero_si512() + } else { + let shf = simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16)); + let zero = u16x32::splat(0); + transmute(simd_select_bitmask(k, shf, zero)) + } } /// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5390,9 +5398,12 @@ pub unsafe fn _mm256_mask_slli_epi16<const IMM8: u32>( a: __m256i, ) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = pslliw256(a.as_i16x16(), imm8); - transmute(simd_select_bitmask(k, r, src.as_i16x16())) + let shf = if IMM8 >= 16 { + u16x16::splat(0) + } else { + simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u16x16())) } /// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
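The masked avx512bw variants above all follow the same shape: compute the shifted vector (or all-zeros for an out-of-range count), then blend it against `src` or against zero with `simd_select_bitmask`. A scalar sketch of that writemask/zeromask selection (illustrative, not the stdarch internals):

```rust
/// Per-lane model of `simd_select_bitmask`: bit i of `k` chooses between the
/// computed lane and the fallback lane (`src` for writemask, zero for zeromask).
fn select_bitmask(k: u32, shifted: &[u16], fallback: &[u16]) -> Vec<u16> {
    shifted
        .iter()
        .zip(fallback)
        .enumerate()
        .map(|(i, (&s, &f))| if (k >> i) & 1 == 1 { s } else { f })
        .collect()
}

fn main() {
    let shifted = [0xAAAA_u16; 4];
    let src = [0x1111_u16; 4];
    // Writemask: unset lanes keep `src`; zeromask: unset lanes become zero.
    assert_eq!(select_bitmask(0b0101, &shifted, &src), vec![0xAAAA, 0x1111, 0xAAAA, 0x1111]);
    assert_eq!(select_bitmask(0b0101, &shifted, &[0; 4]), vec![0xAAAA, 0, 0xAAAA, 0]);
}
```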
@@ -5404,10 +5415,13 @@ pub unsafe fn _mm256_mask_slli_epi16<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm256_maskz_slli_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = pslliw256(a.as_i16x16(), imm8); - let zero = _mm256_setzero_si256().as_i16x16(); - transmute(simd_select_bitmask(k, r, zero)) + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + let shf = simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)); + let zero = u16x16::splat(0); + transmute(simd_select_bitmask(k, shf, zero)) + } } /// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5423,9 +5437,12 @@ pub unsafe fn _mm_mask_slli_epi16<const IMM8: u32>( a: __m128i, ) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = pslliw128(a.as_i16x8(), imm8); - transmute(simd_select_bitmask(k, r, src.as_i16x8())) + let shf = if IMM8 >= 16 { + u16x8::splat(0) + } else { + simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u16x8())) } /// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5437,10 +5454,13 @@ pub unsafe fn _mm_mask_slli_epi16<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_maskz_slli_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = pslliw128(a.as_i16x8(), imm8); - let zero = _mm_setzero_si128().as_i16x8(); - transmute(simd_select_bitmask(k, r, zero)) + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + let shf = simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)); + let zero = u16x8::splat(0); + transmute(simd_select_bitmask(k, shf, zero)) + } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. @@ -5655,9 +5675,11 @@ pub unsafe fn _mm_maskz_srl_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __ #[rustc_legacy_const_generics(1)] pub unsafe fn _mm512_srli_epi16<const IMM8: u32>(a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i16x32(); - let r = vpsrliw(a, IMM8); - transmute(r) + if IMM8 >= 16 { + _mm512_setzero_si512() + } else { + transmute(simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16))) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5673,9 +5695,12 @@ pub unsafe fn _mm512_mask_srli_epi16<const IMM8: u32>( a: __m512i, ) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i16x32(); - let shf = vpsrliw(a, IMM8); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + let shf = if IMM8 >= 16 { + u16x32::splat(0) + } else { + simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u16x32())) } /// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -5688,10 +5713,13 @@ pub unsafe fn _mm512_mask_srli_epi16<const IMM8: u32>( pub unsafe fn _mm512_maskz_srli_epi16<const IMM8: i32>(k: __mmask32, a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); //imm8 should be u32, it seems the document to verify is incorrect - let a = a.as_i16x32(); - let shf = vpsrliw(a, IMM8 as u32); - let zero = _mm512_setzero_si512().as_i16x32(); - transmute(simd_select_bitmask(k, shf, zero)) + if IMM8 >= 16 { + _mm512_setzero_si512() + } else { + let shf = simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16)); + let zero = u16x32::splat(0); + transmute(simd_select_bitmask(k, shf, zero)) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5968,9 +5996,7 @@ pub unsafe fn _mm_maskz_sra_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __ #[rustc_legacy_const_generics(1)] pub unsafe fn _mm512_srai_epi16<const IMM8: u32>(a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i16x32(); - let r = vpsraiw(a, IMM8); - transmute(r) + transmute(simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16))) } /// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5986,8 +6012,7 @@ pub unsafe fn _mm512_mask_srai_epi16<const IMM8: u32>( a: __m512i, ) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i16x32(); - let shf = vpsraiw(a, IMM8); + let shf = simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16)); transmute(simd_select_bitmask(k, shf, src.as_i16x32())) } @@ -6000,9 +6025,8 @@ pub unsafe fn _mm512_mask_srai_epi16<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm512_maskz_srai_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i16x32(); - let shf = vpsraiw(a, IMM8); - let zero = _mm512_setzero_si512().as_i16x32(); + let shf = simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16)); + let zero = i16x32::splat(0); transmute(simd_select_bitmask(k, shf, zero)) } @@ -6019,8 +6043,7 @@ pub unsafe fn _mm256_mask_srai_epi16<const IMM8: u32>( a: __m256i, ) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psraiw256(a.as_i16x16(), imm8); + let r = simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16)); transmute(simd_select_bitmask(k, r, src.as_i16x16())) } @@ -6033,9 +6056,8 @@ pub unsafe fn _mm256_mask_srai_epi16<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm256_maskz_srai_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psraiw256(a.as_i16x16(), imm8); - let zero = _mm256_setzero_si256().as_i16x16(); + let r = simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16)); + let zero = i16x16::splat(0); transmute(simd_select_bitmask(k, r, zero)) } @@ -6052,8 +6074,7 @@ pub unsafe fn _mm_mask_srai_epi16<const IMM8: u32>( a: __m128i, ) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psraiw128(a.as_i16x8(), imm8); + let r = simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16)); transmute(simd_select_bitmask(k, r, src.as_i16x8())) } @@ -6066,9 +6087,8 @@ pub unsafe fn _mm_mask_srai_epi16<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn 
_mm_maskz_srai_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psraiw128(a.as_i16x8(), imm8); - let zero = _mm_setzero_si128().as_i16x8(); + let r = simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16)); + let zero = i16x8::splat(0); transmute(simd_select_bitmask(k, r, zero)) } @@ -9750,7 +9770,7 @@ pub unsafe fn _mm_maskz_alignr_epi8<const IMM8: i32>( #[target_feature(enable = "avx512bw")] #[cfg_attr(test, assert_instr(vpmovswb))] pub unsafe fn _mm512_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) { - vpmovswbmem(mem_addr as *mut i8, a.as_i16x32(), k); + vpmovswbmem(mem_addr, a.as_i16x32(), k); } /// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -9760,7 +9780,7 @@ pub unsafe fn _mm512_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, #[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vpmovswb))] pub unsafe fn _mm256_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) { - vpmovswbmem256(mem_addr as *mut i8, a.as_i16x16(), k); + vpmovswbmem256(mem_addr, a.as_i16x16(), k); } /// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -9770,7 +9790,7 @@ pub unsafe fn _mm256_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, #[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vpmovswb))] pub unsafe fn _mm_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovswbmem128(mem_addr as *mut i8, a.as_i16x8(), k); + vpmovswbmem128(mem_addr, a.as_i16x8(), k); } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -9780,7 +9800,7 @@ pub unsafe fn _mm_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: #[target_feature(enable = "avx512bw")] #[cfg_attr(test, assert_instr(vpmovwb))] pub unsafe fn _mm512_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) { - vpmovwbmem(mem_addr as *mut i8, a.as_i16x32(), k); + vpmovwbmem(mem_addr, a.as_i16x32(), k); } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -9790,7 +9810,7 @@ pub unsafe fn _mm512_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, #[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vpmovwb))] pub unsafe fn _mm256_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) { - vpmovwbmem256(mem_addr as *mut i8, a.as_i16x16(), k); + vpmovwbmem256(mem_addr, a.as_i16x16(), k); } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
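The srai rewrites above (both the earlier avx2 ones and the avx512bw ones here) clamp the count with `IMM8.min(15)` rather than calling the removed `psraiw`/`vpsraiw` intrinsics: for arithmetic right shifts the hardware saturates the count at the lane width minus one, so an oversized count fills each lane with its sign bit. A scalar sketch (illustrative only):

```rust
/// Scalar model of the clamped arithmetic right shift used by the new
/// `_mm*_srai_epi16` bodies: counts past 15 behave like a shift by 15,
/// leaving every bit equal to the sign bit.
fn srai_epi16_lane(lane: i16, imm8: u32) -> i16 {
    lane >> imm8.min(15)
}

fn main() {
    assert_eq!(srai_epi16_lane(-2, 200), -1); // negative lanes become all ones
    assert_eq!(srai_epi16_lane(3, 200), 0);   // non-negative lanes become zero
    assert_eq!(srai_epi16_lane(-32, 3), -4);
}
```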
@@ -9800,7 +9820,7 @@ pub unsafe fn _mm256_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, #[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vpmovwb))] pub unsafe fn _mm_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovwbmem128(mem_addr as *mut i8, a.as_i16x8(), k); + vpmovwbmem128(mem_addr, a.as_i16x8(), k); } /// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -9810,7 +9830,7 @@ pub unsafe fn _mm_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: _ #[target_feature(enable = "avx512bw")] #[cfg_attr(test, assert_instr(vpmovuswb))] pub unsafe fn _mm512_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) { - vpmovuswbmem(mem_addr as *mut i8, a.as_i16x32(), k); + vpmovuswbmem(mem_addr, a.as_i16x32(), k); } /// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -9820,7 +9840,7 @@ pub unsafe fn _mm512_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32 #[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vpmovuswb))] pub unsafe fn _mm256_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) { - vpmovuswbmem256(mem_addr as *mut i8, a.as_i16x16(), k); + vpmovuswbmem256(mem_addr, a.as_i16x16(), k); } /// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
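The `cvt*_storeu_epi8` hunks above simply drop a redundant `as *mut i8` cast, since `mem_addr` already has that type. A hypothetical caller-side sketch for one of these intrinsics (the helper function and all-ones mask below are illustrative, not from the patch):

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512bw")]
unsafe fn saturate_words_to_bytes(dst: &mut [i8; 32], a: core::arch::x86_64::__m512i) {
    use core::arch::x86_64::_mm512_mask_cvtsepi16_storeu_epi8;
    // All 32 mask bits set: every 16-bit lane is saturated to i8 and stored,
    // and the destination pointer is passed without any cast.
    _mm512_mask_cvtsepi16_storeu_epi8(dst.as_mut_ptr(), u32::MAX, a);
}
```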
@@ -9830,7 +9850,7 @@ pub unsafe fn _mm256_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16 #[target_feature(enable = "avx512bw,avx512vl")] #[cfg_attr(test, assert_instr(vpmovuswb))] pub unsafe fn _mm_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovuswbmem128(mem_addr as *mut i8, a.as_i16x8(), k); + vpmovuswbmem128(mem_addr, a.as_i16x8(), k); } #[allow(improper_ctypes)] @@ -9965,13 +9985,6 @@ extern "C" { #[link_name = "llvm.x86.avx512.psll.w.512"] fn vpsllw(a: i16x32, count: i16x8) -> i16x32; - #[link_name = "llvm.x86.avx512.pslli.w.512"] - fn vpslliw(a: i16x32, imm8: u32) -> i16x32; - - #[link_name = "llvm.x86.avx2.pslli.w"] - fn pslliw256(a: i16x16, imm8: i32) -> i16x16; - #[link_name = "llvm.x86.sse2.pslli.w"] - fn pslliw128(a: i16x8, imm8: i32) -> i16x8; #[link_name = "llvm.x86.avx512.psllv.w.512"] fn vpsllvw(a: i16x32, b: i16x32) -> i16x32; @@ -9982,8 +9995,6 @@ extern "C" { #[link_name = "llvm.x86.avx512.psrl.w.512"] fn vpsrlw(a: i16x32, count: i16x8) -> i16x32; - #[link_name = "llvm.x86.avx512.psrli.w.512"] - fn vpsrliw(a: i16x32, imm8: u32) -> i16x32; #[link_name = "llvm.x86.avx512.psrlv.w.512"] fn vpsrlvw(a: i16x32, b: i16x32) -> i16x32; @@ -9994,13 +10005,6 @@ extern "C" { #[link_name = "llvm.x86.avx512.psra.w.512"] fn vpsraw(a: i16x32, count: i16x8) -> i16x32; - #[link_name = "llvm.x86.avx512.psrai.w.512"] - fn vpsraiw(a: i16x32, imm8: u32) -> i16x32; - - #[link_name = "llvm.x86.avx2.psrai.w"] - fn psraiw256(a: i16x16, imm8: i32) -> i16x16; - #[link_name = "llvm.x86.sse2.psrai.w"] - fn psraiw128(a: i16x8, imm8: i32) -> i16x8; #[link_name = "llvm.x86.avx512.psrav.w.512"] fn vpsravw(a: i16x32, count: i16x32) -> i16x32; diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs index 9baa7eeca..5412237ca 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs @@ -17141,9 +17141,11 @@ pub unsafe fn _mm_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> _ #[rustc_legacy_const_generics(1)] pub unsafe fn _mm512_slli_epi32<const IMM8: u32>(a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vpsllid(a, IMM8); - transmute(r) + if IMM8 >= 32 { + _mm512_setzero_si512() + } else { + transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8 as u32))) + } } /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17159,9 +17161,12 @@ pub unsafe fn _mm512_mask_slli_epi32<const IMM8: u32>( a: __m512i, ) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let shf = vpsllid(a, IMM8); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + let shf = if IMM8 >= 32 { + u32x16::splat(0) + } else { + simd_shl(a.as_u32x16(), u32x16::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u32x16())) } /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -17173,10 +17178,13 @@ pub unsafe fn _mm512_mask_slli_epi32<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm512_maskz_slli_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let shf = vpsllid(a, IMM8); - let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, shf, zero)) + if IMM8 >= 32 { + _mm512_setzero_si512() + } else { + let shf = simd_shl(a.as_u32x16(), u32x16::splat(IMM8)); + let zero = u32x16::splat(0); + transmute(simd_select_bitmask(k, shf, zero)) + } } /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17192,9 +17200,12 @@ pub unsafe fn _mm256_mask_slli_epi32<const IMM8: u32>( a: __m256i, ) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psllid256(a.as_i32x8(), imm8); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) + let r = if IMM8 >= 32 { + u32x8::splat(0) + } else { + simd_shl(a.as_u32x8(), u32x8::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, r, src.as_u32x8())) } /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17206,10 +17217,13 @@ pub unsafe fn _mm256_mask_slli_epi32<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm256_maskz_slli_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psllid256(a.as_i32x8(), imm8); - let zero = _mm256_setzero_si256().as_i32x8(); - transmute(simd_select_bitmask(k, r, zero)) + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + let r = simd_shl(a.as_u32x8(), u32x8::splat(IMM8)); + let zero = u32x8::splat(0); + transmute(simd_select_bitmask(k, r, zero)) + } } /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17225,9 +17239,12 @@ pub unsafe fn _mm_mask_slli_epi32<const IMM8: u32>( a: __m128i, ) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psllid128(a.as_i32x4(), imm8); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) + let r = if IMM8 >= 32 { + u32x4::splat(0) + } else { + simd_shl(a.as_u32x4(), u32x4::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, r, src.as_u32x4())) } /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17239,10 +17256,13 @@ pub unsafe fn _mm_mask_slli_epi32<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_maskz_slli_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psllid128(a.as_i32x4(), imm8); - let zero = _mm_setzero_si128().as_i32x4(); - transmute(simd_select_bitmask(k, r, zero)) + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + let r = simd_shl(a.as_u32x4(), u32x4::splat(IMM8)); + let zero = u32x4::splat(0); + transmute(simd_select_bitmask(k, r, zero)) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst. 
@@ -17254,9 +17274,11 @@ pub unsafe fn _mm_maskz_slli_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> #[rustc_legacy_const_generics(1)] pub unsafe fn _mm512_srli_epi32<const IMM8: u32>(a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vpsrlid(a, IMM8); - transmute(r) + if IMM8 >= 32 { + _mm512_setzero_si512() + } else { + transmute(simd_shr(a.as_u32x16(), u32x16::splat(IMM8))) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17272,9 +17294,12 @@ pub unsafe fn _mm512_mask_srli_epi32<const IMM8: u32>( a: __m512i, ) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let shf = vpsrlid(a, IMM8); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + let shf = if IMM8 >= 32 { + u32x16::splat(0) + } else { + simd_shr(a.as_u32x16(), u32x16::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u32x16())) } /// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17286,10 +17311,13 @@ pub unsafe fn _mm512_mask_srli_epi32<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm512_maskz_srli_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let shf = vpsrlid(a, IMM8); - let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, shf, zero)) + if IMM8 >= 32 { + _mm512_setzero_si512() + } else { + let shf = simd_shr(a.as_u32x16(), u32x16::splat(IMM8)); + let zero = u32x16::splat(0); + transmute(simd_select_bitmask(k, shf, zero)) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17305,9 +17333,12 @@ pub unsafe fn _mm256_mask_srli_epi32<const IMM8: u32>( a: __m256i, ) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psrlid256(a.as_i32x8(), imm8); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) + let r = if IMM8 >= 32 { + u32x8::splat(0) + } else { + simd_shr(a.as_u32x8(), u32x8::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, r, src.as_u32x8())) } /// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17319,10 +17350,13 @@ pub unsafe fn _mm256_mask_srli_epi32<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm256_maskz_srli_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psrlid256(a.as_i32x8(), imm8); - let zero = _mm256_setzero_si256().as_i32x8(); - transmute(simd_select_bitmask(k, r, zero)) + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + let r = simd_shr(a.as_u32x8(), u32x8::splat(IMM8)); + let zero = u32x8::splat(0); + transmute(simd_select_bitmask(k, r, zero)) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -17338,9 +17372,12 @@ pub unsafe fn _mm_mask_srli_epi32<const IMM8: u32>( a: __m128i, ) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psrlid128(a.as_i32x4(), imm8); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) + let r = if IMM8 >= 32 { + u32x4::splat(0) + } else { + simd_shr(a.as_u32x4(), u32x4::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, r, src.as_u32x4())) } /// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17352,10 +17389,13 @@ pub unsafe fn _mm_mask_srli_epi32<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_maskz_srli_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psrlid128(a.as_i32x4(), imm8); - let zero = _mm_setzero_si128().as_i32x4(); - transmute(simd_select_bitmask(k, r, zero)) + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + let r = simd_shr(a.as_u32x4(), u32x4::splat(IMM8)); + let zero = u32x4::splat(0); + transmute(simd_select_bitmask(k, r, zero)) + } } /// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst. @@ -17367,9 +17407,11 @@ pub unsafe fn _mm_maskz_srli_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> #[rustc_legacy_const_generics(1)] pub unsafe fn _mm512_slli_epi64<const IMM8: u32>(a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vpslliq(a, IMM8); - transmute(r) + if IMM8 >= 64 { + _mm512_setzero_si512() + } else { + transmute(simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64))) + } } /// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17385,9 +17427,12 @@ pub unsafe fn _mm512_mask_slli_epi64<const IMM8: u32>( a: __m512i, ) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let shf = vpslliq(a, IMM8); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + let shf = if IMM8 >= 64 { + u64x8::splat(0) + } else { + simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u64x8())) } /// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17399,10 +17444,13 @@ pub unsafe fn _mm512_mask_slli_epi64<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm512_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let shf = vpslliq(a, IMM8); - let zero = _mm512_setzero_si512().as_i64x8(); - transmute(simd_select_bitmask(k, shf, zero)) + if IMM8 >= 64 { + _mm512_setzero_si512() + } else { + let shf = simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64)); + let zero = u64x8::splat(0); + transmute(simd_select_bitmask(k, shf, zero)) + } } /// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -17418,9 +17466,12 @@ pub unsafe fn _mm256_mask_slli_epi64<const IMM8: u32>( a: __m256i, ) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = pslliq256(a.as_i64x4(), imm8); - transmute(simd_select_bitmask(k, r, src.as_i64x4())) + let r = if IMM8 >= 64 { + u64x4::splat(0) + } else { + simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)) + }; + transmute(simd_select_bitmask(k, r, src.as_u64x4())) } /// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17432,10 +17483,13 @@ pub unsafe fn _mm256_mask_slli_epi64<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm256_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = pslliq256(a.as_i64x4(), imm8); - let zero = _mm256_setzero_si256().as_i64x4(); - transmute(simd_select_bitmask(k, r, zero)) + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + let r = simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)); + let zero = u64x4::splat(0); + transmute(simd_select_bitmask(k, r, zero)) + } } /// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17451,9 +17505,12 @@ pub unsafe fn _mm_mask_slli_epi64<const IMM8: u32>( a: __m128i, ) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = pslliq128(a.as_i64x2(), imm8); - transmute(simd_select_bitmask(k, r, src.as_i64x2())) + let r = if IMM8 >= 64 { + u64x2::splat(0) + } else { + simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)) + }; + transmute(simd_select_bitmask(k, r, src.as_u64x2())) } /// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17465,10 +17522,13 @@ pub unsafe fn _mm_mask_slli_epi64<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = pslliq128(a.as_i64x2(), imm8); - let zero = _mm_setzero_si128().as_i64x2(); - transmute(simd_select_bitmask(k, r, zero)) + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + let r = simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)); + let zero = u64x2::splat(0); + transmute(simd_select_bitmask(k, r, zero)) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst. @@ -17480,9 +17540,11 @@ pub unsafe fn _mm_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> #[rustc_legacy_const_generics(1)] pub unsafe fn _mm512_srli_epi64<const IMM8: u32>(a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vpsrliq(a, IMM8); - transmute(r) + if IMM8 >= 64 { + _mm512_setzero_si512() + } else { + transmute(simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64))) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
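Editor's note: from the caller's side, the observable change is that out-of-range immediates now deterministically yield zero instead of going through the removed LLVM shift-by-immediate intrinsics. A hedged usage sketch follows, assuming a nightly toolchain; the AVX-512 intrinsics are unstable and the required feature gate (whose name has varied) is omitted here.

// NOTE: assumes nightly Rust with the unstable AVX-512 feature gate enabled.
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn shift_demo() {
    let a = _mm512_set1_epi64(1);
    // An in-range immediate shifts each 64-bit lane as expected.
    let b = _mm512_slli_epi64::<8>(a);
    // An immediate >= 64 now deterministically produces all-zero lanes.
    let c = _mm512_slli_epi64::<64>(a);
    assert_eq!(_mm512_cmpeq_epi64_mask(b, _mm512_set1_epi64(256)), 0xff);
    assert_eq!(_mm512_cmpeq_epi64_mask(c, _mm512_setzero_si512()), 0xff);
}

fn main() {
    #[cfg(target_arch = "x86_64")]
    if is_x86_feature_detected!("avx512f") {
        unsafe { shift_demo() };
    }
}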
@@ -17498,9 +17560,12 @@ pub unsafe fn _mm512_mask_srli_epi64<const IMM8: u32>( a: __m512i, ) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let shf = vpsrliq(a, IMM8); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + let shf = if IMM8 >= 64 { + u64x8::splat(0) + } else { + simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u64x8())) } /// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17512,10 +17577,13 @@ pub unsafe fn _mm512_mask_srli_epi64<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm512_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let shf = vpsrliq(a, IMM8); - let zero = _mm512_setzero_si512().as_i64x8(); - transmute(simd_select_bitmask(k, shf, zero)) + if IMM8 >= 64 { + _mm512_setzero_si512() + } else { + let shf = simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64)); + let zero = u64x8::splat(0); + transmute(simd_select_bitmask(k, shf, zero)) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17531,9 +17599,12 @@ pub unsafe fn _mm256_mask_srli_epi64<const IMM8: u32>( a: __m256i, ) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psrliq256(a.as_i64x4(), imm8); - transmute(simd_select_bitmask(k, r, src.as_i64x4())) + let r = if IMM8 >= 64 { + u64x4::splat(0) + } else { + simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)) + }; + transmute(simd_select_bitmask(k, r, src.as_u64x4())) } /// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17545,10 +17616,13 @@ pub unsafe fn _mm256_mask_srli_epi64<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm256_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psrliq256(a.as_i64x4(), imm8); - let zero = _mm256_setzero_si256().as_i64x4(); - transmute(simd_select_bitmask(k, r, zero)) + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + let r = simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)); + let zero = u64x4::splat(0); + transmute(simd_select_bitmask(k, r, zero)) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17564,9 +17638,12 @@ pub unsafe fn _mm_mask_srli_epi64<const IMM8: u32>( a: __m128i, ) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psrliq128(a.as_i64x2(), imm8); - transmute(simd_select_bitmask(k, r, src.as_i64x2())) + let r = if IMM8 >= 64 { + u64x2::splat(0) + } else { + simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)) + }; + transmute(simd_select_bitmask(k, r, src.as_u64x2())) } /// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -17578,10 +17655,13 @@ pub unsafe fn _mm_mask_srli_epi64<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let imm8 = IMM8 as i32; - let r = psrliq128(a.as_i64x2(), imm8); - let zero = _mm_setzero_si128().as_i64x2(); - transmute(simd_select_bitmask(k, r, zero)) + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + let r = simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)); + let zero = u64x2::splat(0); + transmute(simd_select_bitmask(k, r, zero)) + } } /// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst. @@ -18147,9 +18227,7 @@ pub unsafe fn _mm_maskz_sra_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __ #[rustc_legacy_const_generics(1)] pub unsafe fn _mm512_srai_epi32<const IMM8: u32>(a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vpsraid512(a, IMM8); - transmute(r) + transmute(simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32))) } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -18165,8 +18243,7 @@ pub unsafe fn _mm512_mask_srai_epi32<const IMM8: u32>( a: __m512i, ) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vpsraid512(a, IMM8); + let r = simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32)); transmute(simd_select_bitmask(k, r, src.as_i32x16())) } @@ -18179,9 +18256,8 @@ pub unsafe fn _mm512_mask_srai_epi32<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm512_maskz_srai_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vpsraid512(a, IMM8); - let zero = _mm512_setzero_si512().as_i32x16(); + let r = simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32)); + let zero = i32x16::splat(0); transmute(simd_select_bitmask(k, r, zero)) } @@ -18197,8 +18273,7 @@ pub unsafe fn _mm256_mask_srai_epi32<const IMM8: u32>( k: __mmask8, a: __m256i, ) -> __m256i { - let imm8 = IMM8 as i32; - let r = psraid256(a.as_i32x8(), imm8); + let r = simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31) as i32)); transmute(simd_select_bitmask(k, r, src.as_i32x8())) } @@ -18210,9 +18285,8 @@ pub unsafe fn _mm256_mask_srai_epi32<const IMM8: u32>( #[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm256_maskz_srai_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i { - let imm8 = IMM8 as i32; - let r = psraid256(a.as_i32x8(), imm8); - let zero = _mm256_setzero_si256().as_i32x8(); + let r = simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31) as i32)); + let zero = i32x8::splat(0); transmute(simd_select_bitmask(k, r, zero)) } @@ -18228,8 +18302,7 @@ pub unsafe fn _mm_mask_srai_epi32<const IMM8: u32>( k: __mmask8, a: __m128i, ) -> __m128i { - let imm8 = IMM8 as i32; - let r = psraid128(a.as_i32x4(), imm8); + let r = simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31) as i32)); transmute(simd_select_bitmask(k, r, src.as_i32x4())) } @@ -18241,9 +18314,8 @@ pub unsafe fn _mm_mask_srai_epi32<const IMM8: u32>( #[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_maskz_srai_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { - let imm8 = IMM8 as i32; - let r = psraid128(a.as_i32x4(), 
imm8); - let zero = _mm_setzero_si128().as_i32x4(); + let r = simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31) as i32)); + let zero = i32x4::splat(0); transmute(simd_select_bitmask(k, r, zero)) } @@ -18256,9 +18328,7 @@ pub unsafe fn _mm_maskz_srai_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> #[rustc_legacy_const_generics(1)] pub unsafe fn _mm512_srai_epi64<const IMM8: u32>(a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vpsraiq(a, IMM8); - transmute(r) + transmute(simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64))) } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -18274,8 +18344,7 @@ pub unsafe fn _mm512_mask_srai_epi64<const IMM8: u32>( a: __m512i, ) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let shf = vpsraiq(a, IMM8); + let shf = simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64)); transmute(simd_select_bitmask(k, shf, src.as_i64x8())) } @@ -18288,9 +18357,8 @@ pub unsafe fn _mm512_mask_srai_epi64<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm512_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let shf = vpsraiq(a, IMM8); - let zero = _mm512_setzero_si512().as_i64x8(); + let shf = simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64)); + let zero = i64x8::splat(0); transmute(simd_select_bitmask(k, shf, zero)) } @@ -18303,9 +18371,7 @@ pub unsafe fn _mm512_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) #[rustc_legacy_const_generics(1)] pub unsafe fn _mm256_srai_epi64<const IMM8: u32>(a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vpsraiq256(a, IMM8); - transmute(r) + transmute(simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64))) } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
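Editor's note: note the asymmetry with the logical shifts above. The arithmetic srai rewrites clamp the count with .min(31) or .min(63) rather than zeroing, matching the sign-fill behaviour of vpsrad/vpsraq. A scalar sketch of that clamp, with illustrative helper names only:

/// Scalar model of the immediate arithmetic shifts: a count >= lane width
/// clamps to width-1, so every lane fills with copies of its sign bit.
fn srai_epi32_lane(a: i32, imm8: u32) -> i32 {
    a >> imm8.min(31)
}

fn srai_epi64_lane(a: i64, imm8: u32) -> i64 {
    a >> imm8.min(63)
}

fn main() {
    assert_eq!(srai_epi32_lane(-16, 2), -4);
    assert_eq!(srai_epi32_lane(-16, 200), -1);     // clamped to 31: all sign bits
    assert_eq!(srai_epi64_lane(i64::MIN, 255), -1); // clamped to 63
    assert_eq!(srai_epi32_lane(16, 200), 0);        // non-negative input goes to 0
}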
@@ -18321,8 +18387,7 @@ pub unsafe fn _mm256_mask_srai_epi64<const IMM8: u32>( a: __m256i, ) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let shf = vpsraiq256(a, IMM8); + let shf = simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64)); transmute(simd_select_bitmask(k, shf, src.as_i64x4())) } @@ -18335,9 +18400,8 @@ pub unsafe fn _mm256_mask_srai_epi64<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm256_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let shf = vpsraiq256(a, IMM8); - let zero = _mm256_setzero_si256().as_i64x4(); + let shf = simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64)); + let zero = i64x4::splat(0); transmute(simd_select_bitmask(k, shf, zero)) } @@ -18350,9 +18414,7 @@ pub unsafe fn _mm256_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_srai_epi64<const IMM8: u32>(a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vpsraiq128(a, IMM8); - transmute(r) + transmute(simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64))) } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -18368,8 +18430,7 @@ pub unsafe fn _mm_mask_srai_epi64<const IMM8: u32>( a: __m128i, ) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let shf = vpsraiq128(a, IMM8); + let shf = simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64)); transmute(simd_select_bitmask(k, shf, src.as_i64x2())) } @@ -18382,9 +18443,8 @@ pub unsafe fn _mm_mask_srai_epi64<const IMM8: u32>( #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let shf = vpsraiq128(a, IMM8); - let zero = _mm_setzero_si128().as_i64x2(); + let shf = simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64)); + let zero = i64x2::splat(0); transmute(simd_select_bitmask(k, shf, zero)) } @@ -19383,7 +19443,7 @@ pub unsafe fn _mm_maskz_srlv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> _ /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permute_ps&expand=4170) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))] +#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm512_permute_ps<const MASK: i32>(a: __m512) -> __m512 { static_assert_uimm_bits!(MASK, 8); @@ -19416,7 +19476,7 @@ pub unsafe fn _mm512_permute_ps<const MASK: i32>(a: __m512) -> __m512 { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permute_ps&expand=4168) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))] +#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_mask_permute_ps<const MASK: i32>( src: __m512, @@ -19433,7 +19493,7 @@ pub unsafe fn _mm512_mask_permute_ps<const MASK: i32>( /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permute_ps&expand=4169) #[inline] #[target_feature(enable = 
"avx512f")] -#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))] +#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm512_maskz_permute_ps<const MASK: i32>(k: __mmask16, a: __m512) -> __m512 { static_assert_uimm_bits!(MASK, 8); @@ -19447,7 +19507,7 @@ pub unsafe fn _mm512_maskz_permute_ps<const MASK: i32>(k: __mmask16, a: __m512) /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permute_ps&expand=4165) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))] +#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm256_mask_permute_ps<const MASK: i32>( src: __m256, @@ -19463,7 +19523,7 @@ pub unsafe fn _mm256_mask_permute_ps<const MASK: i32>( /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permute_ps&expand=4166) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))] +#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm256_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m256) -> __m256 { let r = _mm256_permute_ps::<MASK>(a); @@ -19476,7 +19536,7 @@ pub unsafe fn _mm256_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m256) - /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permute_ps&expand=4162) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))] +#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_mask_permute_ps<const MASK: i32>(src: __m128, k: __mmask8, a: __m128) -> __m128 { let r = _mm_permute_ps::<MASK>(a); @@ -19488,7 +19548,7 @@ pub unsafe fn _mm_mask_permute_ps<const MASK: i32>(src: __m128, k: __mmask8, a: /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permute_ps&expand=4163) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))] +#[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m128) -> __m128 { let r = _mm_permute_ps::<MASK>(a); @@ -19501,7 +19561,7 @@ pub unsafe fn _mm_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m128) -> _ /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_permute_pd&expand=4161) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01_10_01))] +#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm512_permute_pd<const MASK: i32>(a: __m512d) -> __m512d { static_assert_uimm_bits!(MASK, 8); @@ -19526,7 +19586,7 @@ pub unsafe fn _mm512_permute_pd<const MASK: i32>(a: __m512d) -> __m512d { /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_permute_pd&expand=4159) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01_10_01))] +#[cfg_attr(test, assert_instr(vshufpd, 
MASK = 0b11_01_10_01))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_mask_permute_pd<const MASK: i32>( src: __m512d, @@ -19543,7 +19603,7 @@ pub unsafe fn _mm512_mask_permute_pd<const MASK: i32>( /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_permute_pd&expand=4160) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01_10_01))] +#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm512_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d { static_assert_uimm_bits!(MASK, 8); @@ -19557,7 +19617,7 @@ pub unsafe fn _mm512_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m512d) /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_permute_pd&expand=4156) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01))] +#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm256_mask_permute_pd<const MASK: i32>( src: __m256d, @@ -19574,7 +19634,7 @@ pub unsafe fn _mm256_mask_permute_pd<const MASK: i32>( /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_permute_pd&expand=4157) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01))] +#[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm256_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d { static_assert_uimm_bits!(MASK, 4); @@ -19588,7 +19648,7 @@ pub unsafe fn _mm256_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m256d) /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_permute_pd&expand=4153) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0b01))] +#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0b01))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_mask_permute_pd<const IMM2: i32>( src: __m128d, @@ -19605,7 +19665,7 @@ pub unsafe fn _mm_mask_permute_pd<const IMM2: i32>( /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_permute_pd&expand=4154) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0b01))] +#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0b01))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_maskz_permute_pd<const IMM2: i32>(k: __mmask8, a: __m128d) -> __m128d { static_assert_uimm_bits!(IMM2, 2); @@ -21035,7 +21095,7 @@ pub unsafe fn _mm_mask2_permutex2var_pd( /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=512_shuffle_epi32&expand=5150) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermilps, MASK = 9))] //should be vpshufd +#[cfg_attr(test, assert_instr(vshufps, MASK = 9))] //should be vpshufd #[rustc_legacy_const_generics(1)] pub unsafe fn _mm512_shuffle_epi32<const MASK: _MM_PERM_ENUM>(a: __m512i) -> __m512i { static_assert_uimm_bits!(MASK, 8); @@ -29721,7 +29781,7 @@ pub unsafe fn _mm_loadu_epi32(mem_addr: *const i32) -> __m128i { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovdw))] pub 
unsafe fn _mm512_mask_cvtepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask16, a: __m512i) { - vpmovdwmem(mem_addr as *mut i8, a.as_i32x16(), k); + vpmovdwmem(mem_addr, a.as_i32x16(), k); } /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29731,7 +29791,7 @@ pub unsafe fn _mm512_mask_cvtepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask16, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovdw))] pub unsafe fn _mm256_mask_cvtepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovdwmem256(mem_addr as *mut i8, a.as_i32x8(), k); + vpmovdwmem256(mem_addr, a.as_i32x8(), k); } /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29741,7 +29801,7 @@ pub unsafe fn _mm256_mask_cvtepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovdw))] pub unsafe fn _mm_mask_cvtepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovdwmem128(mem_addr as *mut i8, a.as_i32x4(), k); + vpmovdwmem128(mem_addr, a.as_i32x4(), k); } /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29751,7 +29811,7 @@ pub unsafe fn _mm_mask_cvtepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovsdw))] pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask16, a: __m512i) { - vpmovsdwmem(mem_addr as *mut i8, a.as_i32x16(), k); + vpmovsdwmem(mem_addr, a.as_i32x16(), k); } /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29761,7 +29821,7 @@ pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask16 #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovsdw))] pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovsdwmem256(mem_addr as *mut i8, a.as_i32x8(), k); + vpmovsdwmem256(mem_addr, a.as_i32x8(), k); } /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29771,7 +29831,7 @@ pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovsdw))] pub unsafe fn _mm_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovsdwmem128(mem_addr as *mut i8, a.as_i32x4(), k); + vpmovsdwmem128(mem_addr, a.as_i32x4(), k); } /// Convert packed unsigned 32-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
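Editor's note: the storeu hunks in this block only drop a redundant cast, since mem_addr is already *mut i8. For readers following the doc comments, a scalar sketch of what the masked truncating store itself does (hypothetical helper, plain Rust):

/// Scalar sketch of `_mm512_mask_cvtepi32_storeu_epi16`: each active 32-bit
/// lane is truncated to 16 bits and written to its slot; inactive slots are
/// left untouched in memory.
fn mask_cvtepi32_storeu_epi16(mem: &mut [i16; 16], k: u16, a: [i32; 16]) {
    for i in 0..16 {
        if (k >> i) & 1 == 1 {
            mem[i] = a[i] as i16; // plain truncation (vpmovdw)
        }
    }
}

fn main() {
    let mut buf = [0x7777i16; 16];
    let a = [0x0001_2345i32; 16];
    mask_cvtepi32_storeu_epi16(&mut buf, 0b0011, a);
    assert_eq!(buf[0], 0x2345);
    assert_eq!(buf[1], 0x2345);
    assert_eq!(buf[2], 0x7777); // inactive lane untouched
}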
@@ -29781,7 +29841,7 @@ pub unsafe fn _mm_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovusdw))] pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask16, a: __m512i) { - vpmovusdwmem(mem_addr as *mut i8, a.as_i32x16(), k); + vpmovusdwmem(mem_addr, a.as_i32x16(), k); } /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29791,7 +29851,7 @@ pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask1 #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovusdw))] pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovusdwmem256(mem_addr as *mut i8, a.as_i32x8(), k); + vpmovusdwmem256(mem_addr, a.as_i32x8(), k); } /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29801,7 +29861,7 @@ pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8 #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovusdw))] pub unsafe fn _mm_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovusdwmem128(mem_addr as *mut i8, a.as_i32x4(), k); + vpmovusdwmem128(mem_addr, a.as_i32x4(), k); } /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29811,7 +29871,7 @@ pub unsafe fn _mm_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovdb))] pub unsafe fn _mm512_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) { - vpmovdbmem(mem_addr as *mut i8, a.as_i32x16(), k); + vpmovdbmem(mem_addr, a.as_i32x16(), k); } /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29821,7 +29881,7 @@ pub unsafe fn _mm512_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovdb))] pub unsafe fn _mm256_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovdbmem256(mem_addr as *mut i8, a.as_i32x8(), k); + vpmovdbmem256(mem_addr, a.as_i32x8(), k); } /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
@@ -29831,7 +29891,7 @@ pub unsafe fn _mm256_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovdb))] pub unsafe fn _mm_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovdbmem128(mem_addr as *mut i8, a.as_i32x4(), k); + vpmovdbmem128(mem_addr, a.as_i32x4(), k); } /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29841,7 +29901,7 @@ pub unsafe fn _mm_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: _ #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovsdb))] pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) { - vpmovsdbmem(mem_addr as *mut i8, a.as_i32x16(), k); + vpmovsdbmem(mem_addr, a.as_i32x16(), k); } /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29851,7 +29911,7 @@ pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovsdb))] pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovsdbmem256(mem_addr as *mut i8, a.as_i32x8(), k); + vpmovsdbmem256(mem_addr, a.as_i32x8(), k); } /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29861,7 +29921,7 @@ pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovsdb))] pub unsafe fn _mm_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovsdbmem128(mem_addr as *mut i8, a.as_i32x4(), k); + vpmovsdbmem128(mem_addr, a.as_i32x4(), k); } /// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29871,7 +29931,7 @@ pub unsafe fn _mm_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovusdb))] pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) { - vpmovusdbmem(mem_addr as *mut i8, a.as_i32x16(), k); + vpmovusdbmem(mem_addr, a.as_i32x16(), k); } /// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
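Editor's note: the intrinsics in this run differ only in how they narrow each lane: plain truncation (vpmovdb), signed saturation (vpmovsdb), or unsigned saturation (vpmovusdb). A scalar sketch of the three flavours, with illustrative helper names:

fn cvt_truncate(x: i32) -> i8 {
    x as i8 // keep the low byte, discard the rest
}

fn cvt_signed_saturate(x: i32) -> i8 {
    x.clamp(i8::MIN as i32, i8::MAX as i32) as i8
}

fn cvt_unsigned_saturate(x: u32) -> u8 {
    x.min(u8::MAX as u32) as u8
}

fn main() {
    assert_eq!(cvt_truncate(0x1234), 0x34);         // vpmovdb keeps the low byte
    assert_eq!(cvt_signed_saturate(0x1234), 127);   // vpmovsdb clamps to i8::MAX
    assert_eq!(cvt_signed_saturate(-1000), -128);   // ... and to i8::MIN
    assert_eq!(cvt_unsigned_saturate(0x1234), 255); // vpmovusdb clamps to u8::MAX
}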
@@ -29881,7 +29941,7 @@ pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16 #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovusdb))] pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovusdbmem256(mem_addr as *mut i8, a.as_i32x8(), k); + vpmovusdbmem256(mem_addr, a.as_i32x8(), k); } /// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29891,7 +29951,7 @@ pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovusdb))] pub unsafe fn _mm_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovusdbmem128(mem_addr as *mut i8, a.as_i32x4(), k); + vpmovusdbmem128(mem_addr, a.as_i32x4(), k); } /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29901,7 +29961,7 @@ pub unsafe fn _mm_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovqw))] pub unsafe fn _mm512_mask_cvtepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m512i) { - vpmovqwmem(mem_addr as *mut i8, a.as_i64x8(), k); + vpmovqwmem(mem_addr, a.as_i64x8(), k); } /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29911,7 +29971,7 @@ pub unsafe fn _mm512_mask_cvtepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovqw))] pub unsafe fn _mm256_mask_cvtepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovqwmem256(mem_addr as *mut i8, a.as_i64x4(), k); + vpmovqwmem256(mem_addr, a.as_i64x4(), k); } /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29921,7 +29981,7 @@ pub unsafe fn _mm256_mask_cvtepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovqw))] pub unsafe fn _mm_mask_cvtepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovqwmem128(mem_addr as *mut i8, a.as_i64x2(), k); + vpmovqwmem128(mem_addr, a.as_i64x2(), k); } /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29931,7 +29991,7 @@ pub unsafe fn _mm_mask_cvtepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovsqw))] pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m512i) { - vpmovsqwmem(mem_addr as *mut i8, a.as_i64x8(), k); + vpmovsqwmem(mem_addr, a.as_i64x8(), k); } /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
@@ -29941,7 +30001,7 @@ pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovsqw))] pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovsqwmem256(mem_addr as *mut i8, a.as_i64x4(), k); + vpmovsqwmem256(mem_addr, a.as_i64x4(), k); } /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29951,7 +30011,7 @@ pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovsqw))] pub unsafe fn _mm_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovsqwmem128(mem_addr as *mut i8, a.as_i64x2(), k); + vpmovsqwmem128(mem_addr, a.as_i64x2(), k); } /// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29961,7 +30021,7 @@ pub unsafe fn _mm_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovusqw))] pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m512i) { - vpmovusqwmem(mem_addr as *mut i8, a.as_i64x8(), k); + vpmovusqwmem(mem_addr, a.as_i64x8(), k); } /// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29971,7 +30031,7 @@ pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8 #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovusqw))] pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovusqwmem256(mem_addr as *mut i8, a.as_i64x4(), k); + vpmovusqwmem256(mem_addr, a.as_i64x4(), k); } /// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -29981,7 +30041,7 @@ pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8 #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovusqw))] pub unsafe fn _mm_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovusqwmem128(mem_addr as *mut i8, a.as_i64x2(), k); + vpmovusqwmem128(mem_addr, a.as_i64x2(), k); } /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
@@ -29991,7 +30051,7 @@ pub unsafe fn _mm_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovqb))] pub unsafe fn _mm512_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { - vpmovqbmem(mem_addr as *mut i8, a.as_i64x8(), k); + vpmovqbmem(mem_addr, a.as_i64x8(), k); } /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30001,7 +30061,7 @@ pub unsafe fn _mm512_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovqb))] pub unsafe fn _mm256_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovqbmem256(mem_addr as *mut i8, a.as_i64x4(), k); + vpmovqbmem256(mem_addr, a.as_i64x4(), k); } /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30011,7 +30071,7 @@ pub unsafe fn _mm256_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovqb))] pub unsafe fn _mm_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovqbmem128(mem_addr as *mut i8, a.as_i64x2(), k); + vpmovqbmem128(mem_addr, a.as_i64x2(), k); } /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30021,7 +30081,7 @@ pub unsafe fn _mm_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: _ #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovsqb))] pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { - vpmovsqbmem(mem_addr as *mut i8, a.as_i64x8(), k); + vpmovsqbmem(mem_addr, a.as_i64x8(), k); } /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30031,7 +30091,7 @@ pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovsqb))] pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovsqbmem256(mem_addr as *mut i8, a.as_i64x4(), k); + vpmovsqbmem256(mem_addr, a.as_i64x4(), k); } /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30041,7 +30101,7 @@ pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovsqb))] pub unsafe fn _mm_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovsqbmem128(mem_addr as *mut i8, a.as_i64x2(), k); + vpmovsqbmem128(mem_addr, a.as_i64x2(), k); } /// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
@@ -30051,7 +30111,7 @@ pub unsafe fn _mm_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovusqb))] pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { - vpmovusqbmem(mem_addr as *mut i8, a.as_i64x8(), k); + vpmovusqbmem(mem_addr, a.as_i64x8(), k); } /// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30061,7 +30121,7 @@ pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovusqb))] pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovusqbmem256(mem_addr as *mut i8, a.as_i64x4(), k); + vpmovusqbmem256(mem_addr, a.as_i64x4(), k); } /// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30071,7 +30131,7 @@ pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovusqb))] pub unsafe fn _mm_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovusqbmem128(mem_addr as *mut i8, a.as_i64x2(), k); + vpmovusqbmem128(mem_addr, a.as_i64x2(), k); } ///Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30081,7 +30141,7 @@ pub unsafe fn _mm_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovqd))] pub unsafe fn _mm512_mask_cvtepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m512i) { - vpmovqdmem(mem_addr as *mut i8, a.as_i64x8(), k); + vpmovqdmem(mem_addr, a.as_i64x8(), k); } ///Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30091,7 +30151,7 @@ pub unsafe fn _mm512_mask_cvtepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovqd))] pub unsafe fn _mm256_mask_cvtepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovqdmem256(mem_addr as *mut i8, a.as_i64x4(), k); + vpmovqdmem256(mem_addr, a.as_i64x4(), k); } ///Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30101,7 +30161,7 @@ pub unsafe fn _mm256_mask_cvtepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovqd))] pub unsafe fn _mm_mask_cvtepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovqdmem128(mem_addr as *mut i8, a.as_i64x2(), k); + vpmovqdmem128(mem_addr, a.as_i64x2(), k); } /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
@@ -30111,7 +30171,7 @@ pub unsafe fn _mm_mask_cvtepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovsqd))] pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m512i) { - vpmovsqdmem(mem_addr as *mut i8, a.as_i64x8(), k); + vpmovsqdmem(mem_addr, a.as_i64x8(), k); } /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30121,7 +30181,7 @@ pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovsqd))] pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovsqdmem256(mem_addr as *mut i8, a.as_i64x4(), k); + vpmovsqdmem256(mem_addr, a.as_i64x4(), k); } /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30131,7 +30191,7 @@ pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovsqd))] pub unsafe fn _mm_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovsqdmem128(mem_addr as *mut i8, a.as_i64x2(), k); + vpmovsqdmem128(mem_addr, a.as_i64x2(), k); } /// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30141,7 +30201,7 @@ pub unsafe fn _mm_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpmovusqd))] pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m512i) { - vpmovusqdmem(mem_addr as *mut i8, a.as_i64x8(), k); + vpmovusqdmem(mem_addr, a.as_i64x8(), k); } /// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30151,7 +30211,7 @@ pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8 #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovusqd))] pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m256i) { - vpmovusqdmem256(mem_addr as *mut i8, a.as_i64x4(), k); + vpmovusqdmem256(mem_addr, a.as_i64x4(), k); } /// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -30161,7 +30221,7 @@ pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8 #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpmovusqd))] pub unsafe fn _mm_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m128i) { - vpmovusqdmem128(mem_addr as *mut i8, a.as_i64x2(), k); + vpmovusqdmem128(mem_addr, a.as_i64x2(), k); } /// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. 
@@ -38449,38 +38509,6 @@ extern "C" { #[link_name = "llvm.x86.avx512.psrlv.q.512"] fn vpsrlvq(a: i64x8, b: i64x8) -> i64x8; - #[link_name = "llvm.x86.avx512.pslli.d.512"] - fn vpsllid(a: i32x16, imm8: u32) -> i32x16; - - #[link_name = "llvm.x86.avx2.pslli.d"] - fn psllid256(a: i32x8, imm8: i32) -> i32x8; - #[link_name = "llvm.x86.sse2.pslli.d"] - fn psllid128(a: i32x4, imm8: i32) -> i32x4; - - #[link_name = "llvm.x86.avx512.psrli.d.512"] - fn vpsrlid(a: i32x16, imm8: u32) -> i32x16; - - #[link_name = "llvm.x86.avx2.psrli.d"] - fn psrlid256(a: i32x8, imm8: i32) -> i32x8; - #[link_name = "llvm.x86.sse2.psrli.d"] - fn psrlid128(a: i32x4, imm8: i32) -> i32x4; - - #[link_name = "llvm.x86.avx512.pslli.q.512"] - fn vpslliq(a: i64x8, imm8: u32) -> i64x8; - - #[link_name = "llvm.x86.avx2.pslli.q"] - fn pslliq256(a: i64x4, imm8: i32) -> i64x4; - #[link_name = "llvm.x86.sse2.pslli.q"] - fn pslliq128(a: i64x2, imm8: i32) -> i64x2; - - #[link_name = "llvm.x86.avx512.psrli.q.512"] - fn vpsrliq(a: i64x8, imm8: u32) -> i64x8; - - #[link_name = "llvm.x86.avx2.psrli.q"] - fn psrliq256(a: i64x4, imm8: i32) -> i64x4; - #[link_name = "llvm.x86.sse2.psrli.q"] - fn psrliq128(a: i64x2, imm8: i32) -> i64x2; - #[link_name = "llvm.x86.avx512.psll.d.512"] fn vpslld(a: i32x16, count: i32x4) -> i32x16; #[link_name = "llvm.x86.avx512.psrl.d.512"] @@ -38500,20 +38528,6 @@ extern "C" { #[link_name = "llvm.x86.avx512.psra.q.128"] fn vpsraq128(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.avx512.psrai.d.512"] - fn vpsraid512(a: i32x16, imm8: u32) -> i32x16; - #[link_name = "llvm.x86.avx2.psrai.d"] - fn psraid256(a: i32x8, imm8: i32) -> i32x8; - #[link_name = "llvm.x86.sse2.psrai.d"] - fn psraid128(a: i32x4, imm8: i32) -> i32x4; - - #[link_name = "llvm.x86.avx512.psrai.q.512"] - fn vpsraiq(a: i64x8, imm8: u32) -> i64x8; - #[link_name = "llvm.x86.avx512.psrai.q.256"] - fn vpsraiq256(a: i64x4, imm8: u32) -> i64x4; - #[link_name = "llvm.x86.avx512.psrai.q.128"] - fn vpsraiq128(a: i64x2, imm8: u32) -> i64x2; - #[link_name = "llvm.x86.avx512.psrav.d.512"] fn vpsravd(a: i32x16, count: i32x16) -> i32x16; diff --git a/library/stdarch/crates/core_arch/src/x86/mod.rs b/library/stdarch/crates/core_arch/src/x86/mod.rs index ee8b7e75d..c5e457ae7 100644 --- a/library/stdarch/crates/core_arch/src/x86/mod.rs +++ b/library/stdarch/crates/core_arch/src/x86/mod.rs @@ -300,14 +300,14 @@ types! { #[stable(feature = "simd_avx512_types", since = "CURRENT_RUSTC_VERSION")] pub struct __m512d(f64, f64, f64, f64, f64, f64, f64, f64); - /// 128-bit wide set of eight 'u16' types, x86-specific + /// 128-bit wide set of eight `u16` types, x86-specific /// /// This type is representing a 128-bit SIMD register which internally is consisted of /// eight packed `u16` instances. Its purpose is for bf16 related intrinsic /// implementations. pub struct __m128bh(u16, u16, u16, u16, u16, u16, u16, u16); - /// 256-bit wide set of 16 'u16' types, x86-specific + /// 256-bit wide set of 16 `u16` types, x86-specific /// /// This type is the same as the `__m256bh` type defined by Intel, /// representing a 256-bit SIMD register which internally is consisted of @@ -318,7 +318,7 @@ types! 
{ u16, u16, u16, u16, u16, u16, u16, u16 ); - /// 512-bit wide set of 32 'u16' types, x86-specific + /// 512-bit wide set of 32 `u16` types, x86-specific /// /// This type is the same as the `__m512bh` type defined by Intel, /// representing a 512-bit SIMD register which internally is consisted of diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs index f4fdb5046..3d572a1f5 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse2.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs @@ -501,7 +501,11 @@ pub unsafe fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - transmute(pslliw(a.as_i16x8(), IMM8)) + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16))) + } } /// Shifts packed 16-bit integers in `a` left by `count` while shifting in @@ -526,7 +530,11 @@ pub unsafe fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - transmute(psllid(a.as_i32x4(), IMM8)) + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32))) + } } /// Shifts packed 32-bit integers in `a` left by `count` while shifting in @@ -551,7 +559,11 @@ pub unsafe fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - transmute(pslliq(a.as_i64x2(), IMM8)) + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64))) + } } /// Shifts packed 64-bit integers in `a` left by `count` while shifting in @@ -577,7 +589,7 @@ pub unsafe fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - transmute(psraiw(a.as_i16x8(), IMM8)) + transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) } /// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign @@ -603,7 +615,7 @@ pub unsafe fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - transmute(psraid(a.as_i32x4(), IMM8)) + transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) } /// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign @@ -680,7 +692,11 @@ unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - transmute(psrliw(a.as_i16x8(), IMM8)) + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16))) + } } /// Shifts packed 16-bit integers in `a` right by `count` while shifting in @@ -706,7 +722,11 @@ pub unsafe fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srli_epi32<const IMM8: i32>(a: 
__m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - transmute(psrlid(a.as_i32x4(), IMM8)) + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32))) + } } /// Shifts packed 32-bit integers in `a` right by `count` while shifting in @@ -732,7 +752,11 @@ pub unsafe fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - transmute(psrliq(a.as_i64x2(), IMM8)) + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64))) + } } /// Shifts packed 64-bit integers in `a` right by `count` while shifting in @@ -1248,7 +1272,7 @@ pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { #[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { - storeudq(mem_addr as *mut i8, a); + mem_addr.write_unaligned(a); } /// Stores the lower 64-bit integer `a` to a memory location. @@ -2515,7 +2539,7 @@ pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) { #[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) { - storeupd(mem_addr as *mut i8, a); + mem_addr.cast::<__m128d>().write_unaligned(a); } /// Stores the lower double-precision (64-bit) floating-point element from `a` @@ -2816,36 +2840,20 @@ extern "C" { fn pmuludq(a: u32x4, b: u32x4) -> u64x2; #[link_name = "llvm.x86.sse2.psad.bw"] fn psadbw(a: u8x16, b: u8x16) -> u64x2; - #[link_name = "llvm.x86.sse2.pslli.w"] - fn pslliw(a: i16x8, imm8: i32) -> i16x8; #[link_name = "llvm.x86.sse2.psll.w"] fn psllw(a: i16x8, count: i16x8) -> i16x8; - #[link_name = "llvm.x86.sse2.pslli.d"] - fn psllid(a: i32x4, imm8: i32) -> i32x4; #[link_name = "llvm.x86.sse2.psll.d"] fn pslld(a: i32x4, count: i32x4) -> i32x4; - #[link_name = "llvm.x86.sse2.pslli.q"] - fn pslliq(a: i64x2, imm8: i32) -> i64x2; #[link_name = "llvm.x86.sse2.psll.q"] fn psllq(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.sse2.psrai.w"] - fn psraiw(a: i16x8, imm8: i32) -> i16x8; #[link_name = "llvm.x86.sse2.psra.w"] fn psraw(a: i16x8, count: i16x8) -> i16x8; - #[link_name = "llvm.x86.sse2.psrai.d"] - fn psraid(a: i32x4, imm8: i32) -> i32x4; #[link_name = "llvm.x86.sse2.psra.d"] fn psrad(a: i32x4, count: i32x4) -> i32x4; - #[link_name = "llvm.x86.sse2.psrli.w"] - fn psrliw(a: i16x8, imm8: i32) -> i16x8; #[link_name = "llvm.x86.sse2.psrl.w"] fn psrlw(a: i16x8, count: i16x8) -> i16x8; - #[link_name = "llvm.x86.sse2.psrli.d"] - fn psrlid(a: i32x4, imm8: i32) -> i32x4; #[link_name = "llvm.x86.sse2.psrl.d"] fn psrld(a: i32x4, count: i32x4) -> i32x4; - #[link_name = "llvm.x86.sse2.psrli.q"] - fn psrliq(a: i64x2, imm8: i32) -> i64x2; #[link_name = "llvm.x86.sse2.psrl.q"] fn psrlq(a: i64x2, count: i64x2) -> i64x2; #[link_name = "llvm.x86.sse2.cvtdq2ps"] @@ -2920,10 +2928,6 @@ extern "C" { fn cvttsd2si(a: __m128d) -> i32; #[link_name = "llvm.x86.sse2.cvttps2dq"] fn cvttps2dq(a: __m128) -> i32x4; - #[link_name = "llvm.x86.sse2.storeu.dq"] - fn storeudq(mem_addr: *mut i8, a: __m128i); - #[link_name = "llvm.x86.sse2.storeu.pd"] - fn storeupd(mem_addr: *mut i8, a: __m128d); } #[cfg(test)] diff --git a/library/stdarch/crates/intrinsic-test/missing_aarch64.txt 
b/library/stdarch/crates/intrinsic-test/missing_aarch64.txt index b09d677af..33b7425d7 100644 --- a/library/stdarch/crates/intrinsic-test/missing_aarch64.txt +++ b/library/stdarch/crates/intrinsic-test/missing_aarch64.txt @@ -12,27 +12,18 @@ vbfmlaltq_f32 vbfmlaltq_lane_f32 vbfmlaltq_laneq_f32 vbfmmlaq_f32 -vsudot_laneq_s32 -vsudot_lane_s32 -vsudotq_laneq_s32 -vsudotq_lane_s32 -vusdot_laneq_s32 -vusdot_lane_s32 -vusdotq_laneq_s32 -vusdotq_lane_s32 -vusdotq_s32 -vusdot_s32 -# Missing from both Clang and stdarch -vrnd32x_f64 +# Implemented in stdarch, but missing in Clang. vrnd32xq_f64 -vrnd32z_f64 vrnd32zq_f64 -vrnd64x_f64 vrnd64xq_f64 -vrnd64z_f64 vrnd64zq_f64 +# LLVM select error, and missing in Clang. +vrnd32x_f64 +vrnd32z_f64 +vrnd64x_f64 +vrnd64z_f64 # LLVM select error in debug builds #vqshlu_n_s16 diff --git a/library/stdarch/crates/intrinsic-test/missing_arm.txt b/library/stdarch/crates/intrinsic-test/missing_arm.txt index 3acc61678..7439cd6e6 100644 --- a/library/stdarch/crates/intrinsic-test/missing_arm.txt +++ b/library/stdarch/crates/intrinsic-test/missing_arm.txt @@ -12,16 +12,6 @@ vbfmlaltq_f32 vbfmlaltq_lane_f32 vbfmlaltq_laneq_f32 vbfmmlaq_f32 -vsudot_laneq_s32 -vsudot_lane_s32 -vsudotq_laneq_s32 -vsudotq_lane_s32 -vusdot_laneq_s32 -vusdot_lane_s32 -vusdotq_laneq_s32 -vusdotq_lane_s32 -vusdotq_s32 -vusdot_s32 # Implemented in Clang and stdarch for A64 only even though CSV claims A32 support __crc32d @@ -170,14 +160,6 @@ vcvtpq_s32_f32 vcvtpq_u32_f32 vcvtp_s32_f32 vcvtp_u32_f32 -vdot_lane_s32 -vdot_lane_u32 -vdotq_lane_s32 -vdotq_lane_u32 -vdotq_s32 -vdotq_u32 -vdot_s32 -vdot_u32 vqdmulh_lane_s16 vqdmulh_lane_s32 vqdmulhq_lane_s16 diff --git a/library/stdarch/crates/intrinsic-test/src/argument.rs b/library/stdarch/crates/intrinsic-test/src/argument.rs index c2f9f9450..dd930115b 100644 --- a/library/stdarch/crates/intrinsic-test/src/argument.rs +++ b/library/stdarch/crates/intrinsic-test/src/argument.rs @@ -173,8 +173,8 @@ impl ArgumentList { .join("\n") } - /// Creates a line for each argument that initalizes the argument from an array [arg]_vals at - /// an offset i using a load intrinsic, in C. + /// Creates a line for each argument that initializes the argument from an array `[arg]_vals` at + /// an offset `i` using a load intrinsic, in C. /// e.g `uint8x8_t a = vld1_u8(&a_vals[i]);` pub fn load_values_c(&self, p64_armv7_workaround: bool) -> String { self.iter() @@ -214,8 +214,8 @@ impl ArgumentList { .join("\n ") } - /// Creates a line for each argument that initalizes the argument from array [ARG]_VALS at - /// an offset i using a load intrinsic, in Rust. + /// Creates a line for each argument that initializes the argument from array `[ARG]_VALS` at + /// an offset `i` using a load intrinsic, in Rust. /// e.g `let a = vld1_u8(A_VALS.as_ptr().offset(i));` pub fn load_values_rust(&self) -> String { self.iter() diff --git a/library/stdarch/crates/std_detect/src/detect/arch/arm.rs b/library/stdarch/crates/std_detect/src/detect/arch/arm.rs index a7dea27fb..fd332e0b2 100644 --- a/library/stdarch/crates/std_detect/src/detect/arch/arm.rs +++ b/library/stdarch/crates/std_detect/src/detect/arch/arm.rs @@ -22,5 +22,7 @@ features! 
{ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] sha2: "sha2"; /// FEAT_SHA1 & FEAT_SHA256 (SHA1 & SHA2-256 instructions) @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] i8mm: "i8mm"; - /// FEAT_I8MM + /// FEAT_I8MM (integer matrix multiplication, plus ASIMD support) + @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] dotprod: "dotprod"; + /// FEAT_DotProd (Vector Dot-Product - ASIMDDP) } diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs index 7601cf0a8..4dc9590e1 100644 --- a/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs +++ b/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs @@ -17,6 +17,8 @@ pub(crate) fn detect_features() -> cache::Initializer { // // [hwcap]: https://github.com/torvalds/linux/blob/master/arch/arm/include/uapi/asm/hwcap.h if let Ok(auxv) = auxvec::auxv() { + enable_feature(&mut value, Feature::i8mm, bit::test(auxv.hwcap, 27)); + enable_feature(&mut value, Feature::dotprod, bit::test(auxv.hwcap, 24)); enable_feature(&mut value, Feature::neon, bit::test(auxv.hwcap, 12)); enable_feature(&mut value, Feature::pmull, bit::test(auxv.hwcap2, 1)); enable_feature(&mut value, Feature::crc, bit::test(auxv.hwcap2, 4)); @@ -37,6 +39,12 @@ pub(crate) fn detect_features() -> cache::Initializer { Feature::neon, c.field("Features").has("neon") && !has_broken_neon(&c), ); + enable_feature(&mut value, Feature::i8mm, c.field("Features").has("i8mm")); + enable_feature( + &mut value, + Feature::dotprod, + c.field("Features").has("asimddp"), + ); enable_feature(&mut value, Feature::pmull, c.field("Features").has("pmull")); enable_feature(&mut value, Feature::crc, c.field("Features").has("crc32")); enable_feature(&mut value, Feature::aes, c.field("Features").has("aes")); diff --git a/library/stdarch/crates/std_detect/src/lib.rs b/library/stdarch/crates/std_detect/src/lib.rs index c0819218c..7fdfb872e 100644 --- a/library/stdarch/crates/std_detect/src/lib.rs +++ b/library/stdarch/crates/std_detect/src/lib.rs @@ -19,8 +19,6 @@ #![deny(clippy::missing_inline_in_public_items)] #![cfg_attr(test, allow(unused_imports))] #![no_std] -// FIXME(Nilstrieb): Remove this once the compiler in stdarch CI has the internal_features lint. 
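
The new `dotprod` and `i8mm` detection reads HWCAP bits 24 and 27 from the auxiliary vector, falling back to the `asimddp` and `i8mm` fields of /proc/cpuinfo. A standalone sketch of the bit test, using the bit numbers from the hunk above (the constant names, helper, and fake HWCAP word are illustrative, not std_detect internals):

// Illustrative HWCAP bit check; only the bit positions come from the patch.
const HWCAP_ASIMDDP: usize = 1 << 24; // dotprod (FEAT_DotProd)
const HWCAP_I8MM: usize = 1 << 27;    // i8mm (FEAT_I8MM)

fn has_bit(hwcap: usize, mask: usize) -> bool {
    hwcap & mask != 0
}

fn main() {
    // Pretend AT_HWCAP value with both bits set.
    let hwcap = HWCAP_ASIMDDP | HWCAP_I8MM;
    println!("dotprod: {}", has_bit(hwcap, HWCAP_ASIMDDP));
    println!("i8mm:    {}", has_bit(hwcap, HWCAP_I8MM));
}
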
-#![allow(unknown_lints)] #![allow(internal_features)] #[cfg(test)] diff --git a/library/stdarch/crates/std_detect/tests/cpu-detection.rs b/library/stdarch/crates/std_detect/tests/cpu-detection.rs index 38bdb5bbd..f93212d24 100644 --- a/library/stdarch/crates/std_detect/tests/cpu-detection.rs +++ b/library/stdarch/crates/std_detect/tests/cpu-detection.rs @@ -20,16 +20,25 @@ fn all() { } #[test] -#[cfg(all( - target_arch = "arm", - any(target_os = "linux", target_os = "android", target_os = "freebsd"), -))] -fn arm_linux_or_freebsd() { +#[cfg(all(target_arch = "arm", any(target_os = "freebsd"),))] +fn arm_freebsd() { + println!("neon: {}", is_arm_feature_detected!("neon")); + println!("pmull: {}", is_arm_feature_detected!("pmull")); + println!("crc: {}", is_arm_feature_detected!("crc")); + println!("aes: {}", is_arm_feature_detected!("aes")); + println!("sha2: {}", is_arm_feature_detected!("sha2")); +} + +#[test] +#[cfg(all(target_arch = "arm", any(target_os = "linux", target_os = "android"),))] +fn arm_linux() { println!("neon: {}", is_arm_feature_detected!("neon")); println!("pmull: {}", is_arm_feature_detected!("pmull")); println!("crc: {}", is_arm_feature_detected!("crc")); println!("aes: {}", is_arm_feature_detected!("aes")); println!("sha2: {}", is_arm_feature_detected!("sha2")); + println!("dotprod: {}", is_arm_feature_detected!("dotprod")); + println!("i8mm: {}", is_arm_feature_detected!("i8mm")); } #[test] diff --git a/library/stdarch/crates/stdarch-gen/neon.spec b/library/stdarch/crates/stdarch-gen/neon.spec index 06090e669..760fa2204 100644 --- a/library/stdarch/crates/stdarch-gen/neon.spec +++ b/library/stdarch/crates/stdarch-gen/neon.spec @@ -3478,27 +3478,138 @@ link-arm = vst4lane._EXTpi8r_ const-arm = LANE generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void +/// Dot product vector form with unsigned and signed integers +name = vusdot +out-suffix +a = 1000, -4200, -1000, 2000 +b = 100, 205, 110, 195, 120, 185, 130, 175, 140, 165, 150, 155, 160, 145, 170, 135 +c = 0, 1, 2, 3, -1, -2, -3, -4, 4, 5, 6, 7, -5, -6, -7, -8 +aarch64 = usdot +arm = vusdot +target = i8mm + +// 1000 + (100, 205, 110, 195) . ( 0, 1, 2, 3) +// -4200 + (120, 185, 130, 175) . (-1, -2, -3, -4) +// ... +validate 2010, -5780, 2370, -1940 + +link-arm = usdot._EXT2_._EXT4_:int32x2_t:uint8x8_t:int8x8_t:int32x2_t +link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:uint8x8_t:int8x8_t:int32x2_t +generate int32x2_t:uint8x8_t:int8x8_t:int32x2_t + +link-arm = usdot._EXT2_._EXT4_:int32x4_t:uint8x16_t:int8x16_t:int32x4_t +link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:uint8x16_t:int8x16_t:int32x4_t +generate int32x4_t:uint8x16_t:int8x16_t:int32x4_t + +/// Dot product index form with unsigned and signed integers +name = vusdot +out-lane-suffixes +constn = LANE +aarch64 = usdot +arm = vusdot +target = i8mm +multi_fn = static_assert_imm-in2_dot-LANE +multi_fn = transmute, c:merge4_t2, c +multi_fn = simd_shuffle!, c:out_signed, c, c, {dup-out_len-LANE as u32} +multi_fn = vusdot-out-noext, a, b, {transmute, c} +a = 1000, -4200, -1000, 2000 +b = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 +c = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11 + +// 1000 + (100, 110, 120, 130) . (4, 3, 2, 1) +// -4200 + (140, 150, 160, 170) . (4, 3, 2, 1) +// ... +n = 0 +validate 2100, -2700, 900, 4300 + +// 1000 + (100, 110, 120, 130) . (0, -1, -2, -3) +// -4200 + (140, 150, 160, 170) . (0, -1, -2, -3) +// ... 
+n = 1 +validate 260, -5180, -2220, 540 + +generate int32x2_t:uint8x8_t:int8x8_t:int32x2_t +generate int32x4_t:uint8x16_t:int8x8_t:int32x4_t + +/// Dot product index form with unsigned and signed integers +name = vusdot +out-lane-suffixes +constn = LANE +// Only AArch64 has the laneq forms. +aarch64 = usdot +target = i8mm +multi_fn = static_assert_imm-in2_dot-LANE +multi_fn = transmute, c:merge4_t2, c +multi_fn = simd_shuffle!, c:out_signed, c, c, {dup-out_len-LANE as u32} +multi_fn = vusdot-out-noext, a, b, {transmute, c} +a = 1000, -4200, -1000, 2000 +b = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 +c = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11 + +// 1000 + (100, 110, 120, 130) . (-4, -5, -6, -7) +// -4200 + (140, 150, 160, 170) . (-4, -5, -6, -7) +// ... +n = 3 +validate -3420, -10140, -8460, -6980 + +generate int32x2_t:uint8x8_t:int8x16_t:int32x2_t +generate int32x4_t:uint8x16_t:int8x16_t:int32x4_t + /// Dot product index form with signed and unsigned integers name = vsudot out-lane-suffixes constn = LANE +aarch64 = sudot +arm = vsudot +target = i8mm + multi_fn = static_assert_imm-in2_dot-LANE -multi_fn = simd_shuffle!, c:unsigned, c, c, {base-4-LANE} -multi_fn = vsudot-outlane-_, a, b, c -a = 1, 2, 1, 2 -b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 -c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +multi_fn = transmute, c:merge4_t2, c +multi_fn = simd_shuffle!, c:out_unsigned, c, c, {dup-out_len-LANE as u32} +multi_fn = vusdot-out-noext, a, {transmute, c}, b +a = -2000, 4200, -1000, 2000 +b = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11 +c = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 + +// -2000 + (4, 3, 2, 1) . (100, 110, 120, 130) +// 4200 + (0, -1, -2, -3) . (100, 110, 120, 130) +// ... n = 0 -validate 31, 72, 31, 72 -target = dotprod +validate -900, 3460, -3580, -2420 + +// -2000 + (4, 3, 2, 1) . (140, 150, 160, 170) +// 4200 + (0, -1, -2, -3) . (140, 150, 160, 170) +// ... +n = 1 +validate -500, 3220, -4460, -3940 +generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t +generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t + +/// Dot product index form with signed and unsigned integers +name = vsudot +out-lane-suffixes +constn = LANE +// Only AArch64 has the laneq forms. aarch64 = sudot -link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:int8x8_t:uint8x8_t:int32x2_t -// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot -//generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t, int32x2_t:int8x8_t:uint8x16_t:int32x2_t -link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:int8x16_t:uint8x16_t:int32x4_t -// LLVM ERROR: Cannot select: intrinsic %llvm.aarch64.neon.usdot -//generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t, int32x4_t:int8x16_t:uint8x16_t:int32x4_t +target = i8mm + +multi_fn = static_assert_imm-in2_dot-LANE +multi_fn = transmute, c:merge4_t2, c +multi_fn = simd_shuffle!, c:out_unsigned, c, c, {dup-out_len-LANE as u32} +multi_fn = vusdot-out-noext, a, {transmute, c}, b +a = -2000, 4200, -1000, 2000 +b = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11 +c = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 + +// -2000 + (4, 3, 2, 1) . (220, 230, 240, 250) +// 4200 + (0, -1, -2, -3) . (220, 230, 240, 250) +// ... 
+n = 3 +validate 300, 2740, -6220, -6980 + +generate int32x2_t:int8x8_t:uint8x16_t:int32x2_t +generate int32x4_t:int8x16_t:uint8x16_t:int32x4_t /// Multiply name = vmul @@ -4612,7 +4723,7 @@ aarch64 = fcmla generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t -/// Dot product arithmetic +/// Dot product arithmetic (vector) name = vdot out-suffix a = 1, 2, 1, 2 @@ -4621,35 +4732,65 @@ c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 validate 31, 176, 31, 176 target = dotprod +arm = vsdot aarch64 = sdot +link-arm = sdot._EXT_._EXT3_ link-aarch64 = sdot._EXT_._EXT3_ generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t +arm = vudot aarch64 = udot +link-arm = udot._EXT_._EXT3_ link-aarch64 = udot._EXT_._EXT3_ generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t -/// Dot product arithmetic +/// Dot product arithmetic (indexed) name = vdot out-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_dot-LANE -multi_fn = simd_shuffle!, c:in_t, c, c, {base-4-LANE} -multi_fn = vdot-out-noext, a, b, c +multi_fn = transmute, c:merge4_t2, c +multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32} +multi_fn = vdot-out-noext, a, b, {transmute, c} a = 1, 2, 1, 2 -b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +b = -1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 n = 0 -validate 31, 72, 31, 72 +validate 29, 72, 31, 72 target = dotprod +// Only AArch64 has the laneq forms. aarch64 = sdot -generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x2_t:int8x8_t:int8x16_t:int32x2_t -generate int32x4_t:int8x16_t:int8x8_t:int32x4_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t +generate int32x2_t:int8x8_t:int8x16_t:int32x2_t +generate int32x4_t:int8x16_t:int8x16_t:int32x4_t + +arm = vsdot +generate int32x2_t:int8x8_t:int8x8_t:int32x2_t +generate int32x4_t:int8x16_t:int8x8_t:int32x4_t + +/// Dot product arithmetic (indexed) +name = vdot +out-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_dot-LANE +multi_fn = transmute, c:merge4_t2, c +multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32} +multi_fn = vdot-out-noext, a, b, {transmute, c} +a = 1, 2, 1, 2 +b = 255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +n = 0 +validate 285, 72, 31, 72 +target = dotprod +// Only AArch64 has the laneq forms. 
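
The vusdot, vsudot, and vdot validation comments above all follow the same recipe: each 32-bit output lane adds to the accumulator the dot product of one group of four bytes from `b` with four bytes from `c`, with the unsigned/signed mix determining how each byte widens. A scalar re-computation of the first vusdot vector-form test values, in plain Rust (the helper is illustrative and not part of stdarch-gen):

// Scalar check of the mixed-sign dot product used by the spec's test vectors.
fn usdot_lane(acc: i32, u: [u8; 4], s: [i8; 4]) -> i32 {
    acc + u
        .iter()
        .zip(s.iter())
        .map(|(&a, &b)| i32::from(a) * i32::from(b))
        .sum::<i32>()
}

fn main() {
    // 1000 + (100, 205, 110, 195) . (0, 1, 2, 3) = 2010
    assert_eq!(usdot_lane(1000, [100, 205, 110, 195], [0, 1, 2, 3]), 2010);
    // -4200 + (120, 185, 130, 175) . (-1, -2, -3, -4) = -5780
    assert_eq!(usdot_lane(-4200, [120, 185, 130, 175], [-1, -2, -3, -4]), -5780);
}
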
aarch64 = udot -generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t -generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t +generate uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t +generate uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t + +arm = vudot +generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t +generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t /// Maximum (vector) name = vmax @@ -6511,7 +6652,7 @@ name = vrshr n-suffix constn = N multi_fn = static_assert-N-1-bits -multi_fn = vrshl-self-noext, a, {vdup-nself-noext, (-N) as _} +multi_fn = vrshl-self-noext, a, {vdup-nself-noext, -N as _} a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 @@ -6538,7 +6679,7 @@ name = vrshr n-suffix constn = N multi_fn = static_assert-N-1-bits -multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, (-N) as _} +multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, -N as _} a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 n = 2 validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 @@ -6650,10 +6791,10 @@ b = 4 n = 2 validate 2 -aarch64 = srsra +aarch64 = srshr generate i64 -/// Ungisned rounding shift right and accumulate. +/// Unsigned rounding shift right and accumulate. name = vrsra n-suffix constn = N @@ -6665,7 +6806,7 @@ b = 4 n = 2 validate 2 -aarch64 = ursra +aarch64 = urshr generate u64 /// Rounding subtract returning high narrow @@ -7071,44 +7212,170 @@ generate uint64x2_t /// Floating-point round to 32-bit integer, using current rounding mode name = vrnd32x -a = 1.1, 1.9, -1.7, -2.3 -validate 1.0, 2.0, -2.0, -2.0 target = frintts +// For validation, the rounding mode should be the default: round-to-nearest (ties-to-even). +a = -1.5, 2.9, 1.5, -2.5 +validate -2.0, 3.0, 2.0, -2.0 + aarch64 = frint32x link-aarch64 = frint32x._EXT_ generate float32x2_t, float32x4_t +// The float64x1_t form uses a different LLVM link and isn't supported by Clang +// (and so has no intrinsic-test), so perform extra validation to make sure +// that it matches the float64x2_t form. + +a = 1.5, -2.5 +validate 2.0, -2.0 +// - The biggest f64 that rounds to i32::MAX. +// - The smallest positive f64 that rounds out of range. +a = 2147483647.499999762, 2147483647.5 +validate 2147483647.0, -2147483648.0 +// - The smallest f64 that rounds to i32::MIN + 1. +// - The largest negative f64 that rounds out of range. +a = -2147483647.499999762, -2147483648.500000477 +validate -2147483647.0, -2147483648.0 +generate float64x2_t + +// Odd-numbered tests for float64x1_t coverage. +a = 2.9 +validate 3.0 +a = -2.5 +validate -2.0 +a = 2147483647.5 +validate -2147483648.0 +a = -2147483648.500000477 +validate -2147483648.0 + +multi_fn = transmute, {self-out-_, {simd_extract, a, 0}} +link-aarch64 = llvm.aarch64.frint32x.f64:f64:::f64 +generate float64x1_t + /// Floating-point round to 32-bit integer toward zero name = vrnd32z -a = 1.1, 1.9, -1.7, -2.3 -validate 1.0, 1.0, -1.0, -2.0 target = frintts +a = -1.5, 2.9, 1.5, -2.5 +validate -1.0, 2.0, 1.0, -2.0 + aarch64 = frint32z link-aarch64 = frint32z._EXT_ generate float32x2_t, float32x4_t +// The float64x1_t form uses a different LLVM link and isn't supported by Clang +// (and so has no intrinsic-test), so perform extra validation to make sure +// that it matches the float64x2_t form. + +a = 1.5, -2.5 +validate 1.0, -2.0 +// - The biggest f64 that rounds to i32::MAX. 
+// - The smallest positive f64 that rounds out of range. +a = 2147483647.999999762, 2147483648.0 +validate 2147483647.0, -2147483648.0 +// - The smallest f64 that rounds to i32::MIN + 1. +// - The largest negative f64 that rounds out of range. +a = -2147483647.999999762, -2147483649.0 +validate -2147483647.0, -2147483648.0 +generate float64x2_t + +// Odd-numbered tests for float64x1_t coverage. +a = 2.9 +validate 2.0 +a = -2.5 +validate -2.0 +a = 2147483648.0 +validate -2147483648.0 +a = -2147483649.0 +validate -2147483648.0 + +multi_fn = transmute, {self-out-_, {simd_extract, a, 0}} +link-aarch64 = llvm.aarch64.frint32z.f64:f64:::f64 +generate float64x1_t + /// Floating-point round to 64-bit integer, using current rounding mode name = vrnd64x -a = 1.1, 1.9, -1.7, -2.3 -validate 1.0, 2.0, -2.0, -2.0 target = frintts +// For validation, the rounding mode should be the default: round-to-nearest (ties-to-even). +a = -1.5, 2.9, 1.5, -2.5 +validate -2.0, 3.0, 2.0, -2.0 + aarch64 = frint64x link-aarch64 = frint64x._EXT_ generate float32x2_t, float32x4_t +// The float64x1_t form uses a different LLVM link and isn't supported by Clang +// (and so has no intrinsic-test), so perform extra validation to make sure +// that it matches the float64x2_t form. + +a = 1.5, -2.5 +validate 2.0, -2.0 +// - The biggest f64 representable as an i64 (0x7ffffffffffffc00). +// - The smallest positive f64 that is out of range (2^63). +a = 9223372036854774784.0, 9223372036854775808.0 +validate 9223372036854774784.0, -9223372036854775808.0 +// - The smallest f64 representable as an i64 (i64::MIN). +// - The biggest negative f64 that is out of range. +a = -9223372036854775808.0, -9223372036854777856.0 +validate -9223372036854775808.0, -9223372036854775808.0 +generate float64x2_t + +// Odd-numbered tests for float64x1_t coverage. +a = 2.9 +validate 3.0 +a = -2.5 +validate -2.0 +a = 9223372036854775808.0 +validate -9223372036854775808.0 +a = -9223372036854777856.0 +validate -9223372036854775808.0 + +multi_fn = transmute, {self-out-_, {simd_extract, a, 0}} +link-aarch64 = llvm.aarch64.frint64x.f64:f64:::f64 +generate float64x1_t + /// Floating-point round to 64-bit integer toward zero name = vrnd64z -a = 1.1, 1.9, -1.7, -2.3 -validate 1.0, 1.0, -1.0, -2.0 target = frintts +a = -1.5, 2.9, 1.5, -2.5 +validate -1.0, 2.0, 1.0, -2.0 + aarch64 = frint64z link-aarch64 = frint64z._EXT_ generate float32x2_t, float32x4_t +// The float64x1_t form uses a different LLVM link and isn't supported by Clang +// (and so has no intrinsic-test), so perform extra validation to make sure +// that it matches the float64x2_t form. + +a = 1.5, -2.5 +validate 1.0, -2.0 +// - The biggest f64 representable as an i64 (0x7ffffffffffffc00). +// - The smallest positive f64 that is out of range (2^63). +a = 9223372036854774784.0, 9223372036854775808.0 +validate 9223372036854774784.0, -9223372036854775808.0 +// - The smallest f64 representable as an i64 (i64::MIN). +// - The biggest negative f64 that is out of range. +a = -9223372036854775808.0, -9223372036854777856.0 +validate -9223372036854775808.0, -9223372036854775808.0 +generate float64x2_t + +// Odd-numbered tests for float64x1_t coverage. 
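
The new float64 test vectors for the vrnd32/vrnd64 family all rely on two rules that are consistent with the values above: round in the requested mode (round-to-nearest, ties-to-even for the `x` forms; toward zero for the `z` forms), and collapse any result that does not fit the target integer width to its most negative value. A scalar sketch of the 32-bit ties-to-even case, matching the vectors above (illustrative only, not the intrinsic itself; assumes a Rust toolchain where `f64::round_ties_even` is stable):

// Scalar model of the frint32x edge cases exercised by the test vectors.
fn frint32x_model(x: f64) -> f64 {
    let r = x.round_ties_even();
    // Out-of-range results collapse to i32::MIN, as the out-of-range vectors expect.
    if r < i32::MIN as f64 || r > i32::MAX as f64 {
        i32::MIN as f64
    } else {
        r
    }
}

fn main() {
    assert_eq!(frint32x_model(-1.5), -2.0);                    // ties to even
    assert_eq!(frint32x_model(2147483647.499999762), 2147483647.0);
    assert_eq!(frint32x_model(2147483647.5), -2147483648.0);   // rounds out of range
    assert_eq!(frint32x_model(-2147483648.500000477), -2147483648.0);
}
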
+a = 2.9 +validate 2.0 +a = -2.5 +validate -2.0 +a = 9223372036854775808.0 +validate -9223372036854775808.0 +a = -9223372036854777856.0 +validate -9223372036854775808.0 + +multi_fn = transmute, {self-out-_, {simd_extract, a, 0}} +link-aarch64 = llvm.aarch64.frint64z.f64:f64:::f64 +generate float64x1_t + /// Transpose elements name = vtrn multi_fn = simd_shuffle!, a1:in_t, a, b, {transpose-1-in_len} @@ -7209,7 +7476,7 @@ generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t arm = vtrn generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t -aarch64 = ext +aarch64 = zip arm = vorr generate int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t generate uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t @@ -7227,7 +7494,7 @@ validate 1., 5., 2., 6., 3., 7., 4., 8. aarch64 = zip arm = vtrn generate float32x2_t:float32x2_t:float32x2x2_t -aarch64 = ext +aarch64 = zip arm = vorr generate float32x4_t:float32x4_t:float32x4x2_t diff --git a/library/stdarch/crates/stdarch-gen/src/main.rs b/library/stdarch/crates/stdarch-gen/src/main.rs index 652aee88c..8e2bea0e2 100644 --- a/library/stdarch/crates/stdarch-gen/src/main.rs +++ b/library/stdarch/crates/stdarch-gen/src/main.rs @@ -799,6 +799,19 @@ fn type_to_half(t: &str) -> &str { } } +fn type_with_merged_lanes(t: &str, elements_per_lane: usize) -> String { + assert_eq!(type_len(t) % elements_per_lane, 0); + let prefix_len = t + .find(|c: char| c.is_ascii_digit()) + .unwrap_or_else(|| t.len()); + format!( + "{prefix}{bits}x{len}_t", + prefix = &t[0..prefix_len], + bits = type_bits(t) * elements_per_lane, + len = type_len(t) / elements_per_lane + ) +} + fn asc(start: i32, len: usize) -> String { let mut s = String::from("["); for i in 0..len { @@ -2515,7 +2528,7 @@ fn gen_arm( {function_doc} #[inline] -#[cfg(target_arch = "aarch64")]{target_feature_aarch64} +#[cfg(not(target_arch = "arm"))]{target_feature_aarch64} #[cfg_attr(test, assert_instr({assert_aarch64}{const_assert}))]{const_legacy}{stable_aarch64} {call_aarch64} "#, @@ -2993,6 +3006,12 @@ fn get_call( re = Some((re_params[0].clone(), in_t[1].to_string())); } else if re_params[1] == "out_t" { re = Some((re_params[0].clone(), out_t.to_string())); + } else if re_params[1] == "out_unsigned" { + re = Some((re_params[0].clone(), type_to_unsigned(out_t).to_string())); + } else if re_params[1] == "out_signed" { + re = Some((re_params[0].clone(), type_to_signed(out_t).to_string())); + } else if re_params[1] == "merge4_t2" { + re = Some((re_params[0].clone(), type_with_merged_lanes(in_t[2], 4))); } else if re_params[1] == "half" { re = Some((re_params[0].clone(), type_to_half(in_t[1]).to_string())); } else if re_params[1] == "in_ntt" { diff --git a/library/stdarch/crates/stdarch-test/Cargo.toml b/library/stdarch/crates/stdarch-test/Cargo.toml index ce5705c6e..3a2130d4e 100644 --- a/library/stdarch/crates/stdarch-test/Cargo.toml +++ b/library/stdarch/crates/stdarch-test/Cargo.toml @@ -7,11 +7,13 @@ edition = "2021" [dependencies] assert-instr-macro = { path = "../assert-instr-macro" } simd-test-macro = { path = "../simd-test-macro" } -cc = "1.0" lazy_static = "1.0" rustc-demangle = "0.1.8" cfg-if = "1.0" +[target.'cfg(windows)'.dependencies] +cc = "1.0" + # We use a crates.io dependency to disassemble wasm binaries to look for # instructions for `#[assert_instr]`. 
Note that we use an `=` dependency here # instead of a floating dependency because the text format for wasm changes over diff --git a/library/stdarch/crates/stdarch-test/src/disassembly.rs b/library/stdarch/crates/stdarch-test/src/disassembly.rs index 5d7a27e8a..54df7261e 100644 --- a/library/stdarch/crates/stdarch-test/src/disassembly.rs +++ b/library/stdarch/crates/stdarch-test/src/disassembly.rs @@ -1,7 +1,7 @@ //! Disassembly calling function for most targets. use crate::Function; -use std::{collections::HashSet, env, process::Command, str}; +use std::{collections::HashSet, env, str}; // Extracts the "shim" name from the `symbol`. fn normalize(mut symbol: &str) -> String { @@ -39,10 +39,11 @@ fn normalize(mut symbol: &str) -> String { symbol } +#[cfg(windows)] pub(crate) fn disassemble_myself() -> HashSet<Function> { let me = env::current_exe().expect("failed to get current exe"); - let disassembly = if cfg!(target_os = "windows") && cfg!(target_env = "msvc") { + let disassembly = if cfg!(target_env = "msvc") { let target = if cfg!(target_arch = "x86_64") { "x86_64-pc-windows-msvc" } else if cfg!(target_arch = "x86") { @@ -65,32 +66,39 @@ pub(crate) fn disassemble_myself() -> HashSet<Function> { assert!(output.status.success()); // Windows does not return valid UTF-8 output: String::from_utf8_lossy(Vec::leak(output.stdout)) - } else if cfg!(target_os = "windows") { - panic!("disassembly unimplemented") } else { - let objdump = env::var("OBJDUMP").unwrap_or_else(|_| "objdump".to_string()); - let add_args = if cfg!(target_os = "macos") && cfg!(target_arch = "aarch64") { - // Target features need to be enabled for LLVM objdump on Macos ARM64 - vec!["--mattr=+v8.6a,+crypto,+tme"] - } else { - vec![] - }; - let output = Command::new(objdump.clone()) - .arg("--disassemble") - .arg("--no-show-raw-insn") - .args(add_args) - .arg(&me) - .output() - .unwrap_or_else(|_| panic!("failed to execute objdump. OBJDUMP={objdump}")); - println!( - "{}\n{}", - output.status, - String::from_utf8_lossy(&output.stderr) - ); - assert!(output.status.success()); + panic!("disassembly unimplemented") + }; - String::from_utf8_lossy(Vec::leak(output.stdout)) + parse(&disassembly) +} + +#[cfg(not(windows))] +pub(crate) fn disassemble_myself() -> HashSet<Function> { + let me = env::current_exe().expect("failed to get current exe"); + + let objdump = env::var("OBJDUMP").unwrap_or_else(|_| "objdump".to_string()); + let add_args = if cfg!(target_os = "macos") && cfg!(target_arch = "aarch64") { + // Target features need to be enabled for LLVM objdump on Macos ARM64 + vec!["--mattr=+v8.6a,+crypto,+tme"] + } else { + vec![] }; + let output = std::process::Command::new(objdump.clone()) + .arg("--disassemble") + .arg("--no-show-raw-insn") + .args(add_args) + .arg(&me) + .output() + .unwrap_or_else(|_| panic!("failed to execute objdump. 
OBJDUMP={objdump}")); + println!( + "{}\n{}", + output.status, + String::from_utf8_lossy(&output.stderr) + ); + assert!(output.status.success()); + + let disassembly = String::from_utf8_lossy(Vec::leak(output.stdout)); parse(&disassembly) } diff --git a/library/stdarch/crates/stdarch-test/src/lib.rs b/library/stdarch/crates/stdarch-test/src/lib.rs index 232e47ec1..7ea189ff5 100644 --- a/library/stdarch/crates/stdarch-test/src/lib.rs +++ b/library/stdarch/crates/stdarch-test/src/lib.rs @@ -129,17 +129,20 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) { "usad8" | "vfma" | "vfms" => 27, "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29, // core_arch/src/arm_shared/simd32 - // vst1q_s64_x4_vst1 : #instructions = 22 >= 22 (limit) - "vld3" => 23, + // vst1q_s64_x4_vst1 : #instructions = 27 >= 22 (limit) + "vld3" => 28, // core_arch/src/arm_shared/simd32 - // vld4q_lane_u32_vld4 : #instructions = 31 >= 22 (limit) - "vld4" => 32, + // vld4q_lane_u32_vld4 : #instructions = 36 >= 22 (limit) + "vld4" => 37, // core_arch/src/arm_shared/simd32 // vst1q_s64_x4_vst1 : #instructions = 40 >= 22 (limit) "vst1" => 41, // core_arch/src/arm_shared/simd32 - // vst4q_u32_vst4 : #instructions = 26 >= 22 (limit) - "vst4" => 27, + // vst3q_u32_vst3 : #instructions = 25 >= 22 (limit) + "vst3" => 26, + // core_arch/src/arm_shared/simd32 + // vst4q_u32_vst4 : #instructions = 33 >= 22 (limit) + "vst4" => 34, // core_arch/src/arm_shared/simd32 // vst1q_p64_x4_nop : #instructions = 33 >= 22 (limit) diff --git a/library/stdarch/examples/connect5.rs b/library/stdarch/examples/connect5.rs index 09e7e48a7..805108c24 100644 --- a/library/stdarch/examples/connect5.rs +++ b/library/stdarch/examples/connect5.rs @@ -851,7 +851,7 @@ fn check_patterndead4(pos: &Pos, sd: Side) -> i32 { n } -/// Check <b>-OOO-, -OO-O-, -O-OO-</br> +/// Check <b>-OOO-, -OO-O-, -O-OO-</b> fn check_patternlive3(pos: &Pos, sd: Side) -> i32 { let mut n: i32 = 0; |