Diffstat (limited to 'library/stdarch/crates/core_arch/src/arm')
-rw-r--r--  library/stdarch/crates/core_arch/src/arm/armclang.rs  |   35
-rw-r--r--  library/stdarch/crates/core_arch/src/arm/dsp.rs       |  384
-rw-r--r--  library/stdarch/crates/core_arch/src/arm/ex.rs        |  125
-rw-r--r--  library/stdarch/crates/core_arch/src/arm/mod.rs       |  113
-rw-r--r--  library/stdarch/crates/core_arch/src/arm/neon.rs      | 1369
-rw-r--r--  library/stdarch/crates/core_arch/src/arm/sat.rs       |    8
-rw-r--r--  library/stdarch/crates/core_arch/src/arm/simd32.rs    |  728
-rw-r--r--  library/stdarch/crates/core_arch/src/arm/v6.rs        |   49
-rw-r--r--  library/stdarch/crates/core_arch/src/arm/v7.rs        |   88
9 files changed, 2899 insertions, 0 deletions
diff --git a/library/stdarch/crates/core_arch/src/arm/armclang.rs b/library/stdarch/crates/core_arch/src/arm/armclang.rs
new file mode 100644
index 000000000..e68c02d02
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/armclang.rs
@@ -0,0 +1,35 @@
+//! ARM compiler specific intrinsics
+//!
+//! # References
+//!
+//! - [ARM Compiler v6.10 - armclang Reference Guide][arm_comp_ref]
+//!
+//! [arm_comp_ref]: https://developer.arm.com/docs/100067/0610
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Inserts a breakpoint instruction.
+///
+/// `VAL` is a compile-time constant integer in range `[0, 255]`.
+///
+/// The breakpoint instruction inserted is `BKPT` on A32/T32.
+///
+/// # Note
+///
+/// [ARM's documentation][arm_docs] defines that `__breakpoint` accepts the
+/// following values for `VAL`:
+///
+/// - `0...65535` when compiling as A32,
+/// - `0...255` when compiling as T32.
+///
+/// The current implementation only accepts values in range `[0, 255]`.
+///
+/// [arm_docs]: https://developer.arm.com/docs/100067/latest/compiler-specific-intrinsics/__breakpoint-intrinsic
+#[cfg_attr(test, assert_instr(bkpt, VAL = 0))]
+#[inline(always)]
+#[rustc_legacy_const_generics(0)]
+pub unsafe fn __breakpoint<const VAL: i32>() {
+ static_assert_imm8!(VAL);
+ crate::arch::asm!("bkpt #{}", const VAL);
+}
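+
+// Illustrative usage sketch (hypothetical helper, not part of this module): a
+// debug-build-only trap built on `__breakpoint`. The name `debug_trap` and the
+// immediate value 0 are arbitrary choices for the example.
+//
+//     #[cfg(debug_assertions)]
+//     #[inline(always)]
+//     unsafe fn debug_trap() {
+//         // Emits `BKPT #0`; the immediate is ignored by the core and is only
+//         // visible to an attached debugger.
+//         core::arch::arm::__breakpoint::<0>();
+//     }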
diff --git a/library/stdarch/crates/core_arch/src/arm/dsp.rs b/library/stdarch/crates/core_arch/src/arm/dsp.rs
new file mode 100644
index 000000000..6720f97a5
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/dsp.rs
@@ -0,0 +1,384 @@
+//! # References:
+//!
+//! - Section 8.3 "16-bit multiplications" of ACLE
+//!
+//! Intrinsics that could live here:
+//!
+//! - \[x\] __smulbb
+//! - \[x\] __smulbt
+//! - \[x\] __smultb
+//! - \[x\] __smultt
+//! - \[x\] __smulwb
+//! - \[x\] __smulwt
+//! - \[x\] __qadd
+//! - \[x\] __qsub
+//! - \[x\] __qdbl
+//! - \[x\] __smlabb
+//! - \[x\] __smlabt
+//! - \[x\] __smlatb
+//! - \[x\] __smlatt
+//! - \[x\] __smlawb
+//! - \[x\] __smlawt
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+use crate::mem::transmute;
+
+types! {
+ /// ARM-specific 32-bit wide vector of two packed `i16`.
+ pub struct int16x2_t(i16, i16);
+ /// ARM-specific 32-bit wide vector of two packed `u16`.
+ pub struct uint16x2_t(u16, u16);
+}
+
+extern "unadjusted" {
+ #[link_name = "llvm.arm.smulbb"]
+ fn arm_smulbb(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smulbt"]
+ fn arm_smulbt(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smultb"]
+ fn arm_smultb(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smultt"]
+ fn arm_smultt(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smulwb"]
+ fn arm_smulwb(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smulwt"]
+ fn arm_smulwt(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qadd"]
+ fn arm_qadd(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qsub"]
+ fn arm_qsub(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlabb"]
+ fn arm_smlabb(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlabt"]
+ fn arm_smlabt(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlatb"]
+ fn arm_smlatb(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlatt"]
+ fn arm_smlatt(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlawb"]
+ fn arm_smlawb(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlawt"]
+ fn arm_smlawt(a: i32, b: i32, c: i32) -> i32;
+}
+
+/// Insert a SMULBB instruction
+///
+/// Returns the equivalent of a\[0\] * b\[0\]
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+#[inline]
+#[cfg_attr(test, assert_instr(smulbb))]
+pub unsafe fn __smulbb(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smulbb(transmute(a), transmute(b))
+}
+
+/// Insert a SMULTB instruction
+///
+/// Returns the equivalent of a\[1\] * b\[0\]
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+#[inline]
+#[cfg_attr(test, assert_instr(smultb))]
+pub unsafe fn __smultb(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smultb(transmute(a), transmute(b))
+}
+
+/// Insert a SMULBT instruction
+///
+/// Returns the equivalent of a\[0\] * b\[1\]
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+#[inline]
+#[cfg_attr(test, assert_instr(smulbt))]
+pub unsafe fn __smulbt(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smulbt(transmute(a), transmute(b))
+}
+
+/// Insert a SMULTT instruction
+///
+/// Returns the equivalent of a\[1\] * b\[1\]
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+#[inline]
+#[cfg_attr(test, assert_instr(smultt))]
+pub unsafe fn __smultt(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smultt(transmute(a), transmute(b))
+}
+
+/// Insert a SMULWB instruction
+///
+/// Multiplies the 32-bit signed first operand with the low halfword
+/// (as a 16-bit signed integer) of the second operand.
+/// Returns the top 32 bits of the 48-bit product.
+#[inline]
+#[cfg_attr(test, assert_instr(smulwb))]
+pub unsafe fn __smulwb(a: int16x2_t, b: i32) -> i32 {
+ arm_smulwb(transmute(a), b)
+}
+
+/// Insert a SMULWT instruction
+///
+/// Multiplies the 32-bit signed first operand with the high halfword
+/// (as a 16-bit signed integer) of the second operand.
+/// Returns the top 32 bits of the 48-bit product.
+#[inline]
+#[cfg_attr(test, assert_instr(smulwt))]
+pub unsafe fn __smulwt(a: int16x2_t, b: i32) -> i32 {
+ arm_smulwt(transmute(a), b)
+}
+
+/// Signed saturating addition
+///
+/// Returns the 32-bit saturating signed equivalent of a + b.
+/// Sets the Q flag if saturation occurs.
+#[inline]
+#[cfg_attr(test, assert_instr(qadd))]
+pub unsafe fn __qadd(a: i32, b: i32) -> i32 {
+ arm_qadd(a, b)
+}
+
+/// Signed saturating subtraction
+///
+/// Returns the 32-bit saturating signed equivalent of a - b.
+/// Sets the Q flag if saturation occurs.
+#[inline]
+#[cfg_attr(test, assert_instr(qsub))]
+pub unsafe fn __qsub(a: i32, b: i32) -> i32 {
+ arm_qsub(a, b)
+}
+
+/// Insert a QADD instruction
+///
+/// Returns the 32-bit saturating signed equivalent of a + a.
+/// Sets the Q flag if saturation occurs.
+#[inline]
+#[cfg_attr(test, assert_instr(qadd))]
+pub unsafe fn __qdbl(a: i32) -> i32 {
+ arm_qadd(a, a)
+}
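+
+// Worked example of the saturating behaviour (hypothetical values): plain
+// `i32` addition of `i32::MAX + 1` would wrap to `i32::MIN`, whereas
+// `__qadd(i32::MAX, 1)` clamps to `i32::MAX` and sets the Q flag; likewise
+// `__qsub(i32::MIN, 1)` clamps to `i32::MIN`, and `__qdbl(i32::MAX)` clamps
+// rather than doubling.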
+
+/// Insert a SMLABB instruction
+///
+/// Returns the equivalent of a\[0\] * b\[0\] + c
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+/// Sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smlabb))]
+pub unsafe fn __smlabb(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+ arm_smlabb(transmute(a), transmute(b), c)
+}
+
+/// Insert a SMLABT instruction
+///
+/// Returns the equivalent of a\[0\] * b\[1\] + c
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+/// Sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smlabt))]
+pub unsafe fn __smlabt(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+ arm_smlabt(transmute(a), transmute(b), c)
+}
+
+/// Insert a SMLATB instruction
+///
+/// Returns the equivalent of a\[1\] * b\[0\] + c
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+/// Sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smlatb))]
+pub unsafe fn __smlatb(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+ arm_smlatb(transmute(a), transmute(b), c)
+}
+
+/// Insert a SMLATT instruction
+///
+/// Returns the equivalent of a\[1\] * b\[1\] + c
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+/// Sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smlatt))]
+pub unsafe fn __smlatt(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+ arm_smlatt(transmute(a), transmute(b), c)
+}
+
+/// Insert a SMLAWB instruction
+///
+/// Returns the equivalent of (a * b\[0\] + (c << 16)) >> 16
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+/// Sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smlawb))]
+pub unsafe fn __smlawb(a: i32, b: int16x2_t, c: i32) -> i32 {
+ arm_smlawb(a, transmute(b), c)
+}
+
+/// Insert a SMLAWT instruction
+///
+/// Returns the equivalent of (a * b\[1\] + (c << 16)) >> 16
+/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits.
+/// Sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smlawt))]
+pub unsafe fn __smlawt(a: i32, b: int16x2_t, c: i32) -> i32 {
+ arm_smlawt(a, transmute(b), c)
+}
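+
+// Illustrative sketch (hypothetical helper, mirroring what the tests below do
+// with `transmute`): building an `int16x2_t` from two `i16` lanes via the
+// internal `i16x2` simd type and accumulating with `__smlabb`.
+//
+//     use crate::core_arch::simd::i16x2;
+//
+//     unsafe fn mac_low_halves(a0: i16, a1: i16, b0: i16, b1: i16, acc: i32) -> i32 {
+//         let a: int16x2_t = transmute(i16x2::new(a0, a1));
+//         let b: int16x2_t = transmute(i16x2::new(b0, b1));
+//         // acc + a0 * b0; the Q flag is set if the addition overflows.
+//         __smlabb(a, b, acc)
+//     }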
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::{
+ arm::*,
+ simd::{i16x2, i8x4, u8x4},
+ };
+ use std::mem::transmute;
+ use stdarch_test::simd_test;
+
+ #[test]
+ fn smulbb() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ assert_eq!(super::__smulbb(transmute(a), transmute(b)), 10 * 30);
+ }
+ }
+
+ #[test]
+ fn smulbt() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ assert_eq!(super::__smulbt(transmute(a), transmute(b)), 10 * 40);
+ }
+ }
+
+ #[test]
+ fn smultb() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ assert_eq!(super::__smultb(transmute(a), transmute(b)), 20 * 30);
+ }
+ }
+
+ #[test]
+ fn smultt() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ assert_eq!(super::__smultt(transmute(a), transmute(b)), 20 * 40);
+ }
+ }
+
+ #[test]
+ fn smulwb() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = 30;
+ assert_eq!(super::__smulwb(transmute(a), b), 20 * b);
+ }
+ }
+
+ #[test]
+ fn smulwt() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = 30;
+ assert_eq!(super::__smulwt(transmute(a), b), (10 * b) >> 16);
+ }
+ }
+
+ #[test]
+ fn qadd() {
+ unsafe {
+ assert_eq!(super::__qadd(-10, 60), 50);
+ assert_eq!(super::__qadd(i32::MAX, 10), i32::MAX);
+ assert_eq!(super::__qadd(i32::MIN, -10), i32::MIN);
+ }
+ }
+
+ #[test]
+ fn qsub() {
+ unsafe {
+ assert_eq!(super::__qsub(10, 60), -50);
+ assert_eq!(super::__qsub(i32::MAX, -10), i32::MAX);
+ assert_eq!(super::__qsub(i32::MIN, 10), i32::MIN);
+ }
+ }
+
+    #[test]
+    fn qdbl() {
+ unsafe {
+ assert_eq!(super::__qdbl(10), 20);
+ assert_eq!(super::__qdbl(i32::MAX), i32::MAX);
+ }
+ }
+
+    #[test]
+    fn smlabb() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ let c = 50;
+ let r = (10 * 30) + c;
+ assert_eq!(super::__smlabb(transmute(a), transmute(b), c), r);
+ }
+ }
+
+    #[test]
+    fn smlabt() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ let c = 50;
+ let r = (10 * 40) + c;
+ assert_eq!(super::__smlabt(transmute(a), transmute(b), c), r);
+ }
+ }
+
+    #[test]
+    fn smlatb() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ let c = 50;
+ let r = (20 * 30) + c;
+            assert_eq!(super::__smlatb(transmute(a), transmute(b), c), r);
+ }
+ }
+
+    #[test]
+    fn smlatt() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(30, 40);
+ let c = 50;
+ let r = (20 * 40) + c;
+ assert_eq!(super::__smlatt(transmute(a), transmute(b), c), r);
+ }
+ }
+
+    #[test]
+    fn smlawb() {
+ unsafe {
+ let a: i32 = 10;
+ let b = i16x2::new(30, 40);
+ let c: i32 = 50;
+ let r: i32 = ((a * 30) + (c << 16)) >> 16;
+ assert_eq!(super::__smlawb(a, transmute(b), c), r);
+ }
+ }
+
+    #[test]
+    fn smlawt() {
+ unsafe {
+ let a: i32 = 10;
+ let b = i16x2::new(30, 40);
+ let c: i32 = 50;
+ let r: i32 = ((a * 40) + (c << 16)) >> 16;
+ assert_eq!(super::__smlawt(a, transmute(b), c), r);
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/ex.rs b/library/stdarch/crates/core_arch/src/arm/ex.rs
new file mode 100644
index 000000000..75f378642
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/ex.rs
@@ -0,0 +1,125 @@
+// Reference: Section 5.4.4 "LDREX / STREX" of ACLE
+
+/// Removes the exclusive lock created by LDREX
+// Supported: v6, v6K, v7-M, v7-A, v7-R
+// Not supported: v5, v6-M
+// NOTE: there's no dedicated CLREX instruction in v6 (<v6k); to clear the exclusive monitor users
+// have to do a dummy STREX operation
+#[cfg(any(
+ all(target_feature = "v6k", not(target_feature = "mclass")), // excludes v6-M
+ all(target_feature = "v7", target_feature = "mclass"), // v7-M
+ doc
+))]
+pub unsafe fn __clrex() {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.clrex"]
+ fn clrex();
+ }
+
+ clrex()
+}
+
+/// Executes an exclusive LDR instruction for an 8-bit value.
+// Supported: v6K, v7-M, v7-A, v7-R
+// Not supported: v5, v6, v6-M
+#[cfg(any(
+ target_feature = "v6k", // includes v7-M but excludes v6-M
+ doc
+))]
+pub unsafe fn __ldrexb(p: *const u8) -> u8 {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.ldrex.p0i8"]
+ fn ldrex8(p: *const u8) -> u32;
+ }
+
+ ldrex8(p) as u8
+}
+
+/// Executes an exclusive LDR instruction for a 16-bit value.
+// Supported: v6K, v7-M, v7-A, v7-R, v8
+// Not supported: v5, v6, v6-M
+#[cfg(any(
+ target_feature = "v6k", // includes v7-M but excludes v6-M
+ doc
+))]
+pub unsafe fn __ldrexh(p: *const u16) -> u16 {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.ldrex.p0i16"]
+ fn ldrex16(p: *const u16) -> u32;
+ }
+
+ ldrex16(p) as u16
+}
+
+/// Executes an exclusive LDR instruction for a 32-bit value.
+// Supported: v6, v7-M, v6K, v7-A, v7-R, v8
+// Not supported: v5, v6-M
+#[cfg(any(
+ all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M
+ all(target_feature = "v7", target_feature = "mclass"), // v7-M
+ doc
+))]
+pub unsafe fn __ldrex(p: *const u32) -> u32 {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.ldrex.p0i32"]
+ fn ldrex32(p: *const u32) -> u32;
+ }
+
+ ldrex32(p)
+}
+
+/// Executes an exclusive STR instruction for 8-bit values
+///
+/// Returns `0` if the operation succeeded, or `1` if it failed
+// Supported: v6K, v7-M, v7-A, v7-R
+// Not supported: v5, v6, v6-M
+#[cfg(any(
+ target_feature = "v6k", // includes v7-M but excludes v6-M
+ doc
+))]
+pub unsafe fn __strexb(value: u32, addr: *mut u8) -> u32 {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.strex.p0i8"]
+ fn strex8(value: u32, addr: *mut u8) -> u32;
+ }
+
+ strex8(value, addr)
+}
+
+/// Executes an exclusive STR instruction for 16-bit values
+///
+/// Returns `0` if the operation succeeded, or `1` if it failed
+// Supported: v6K, v7-M, v7-A, v7-R, v8
+// Not supported: v5, v6, v6-M
+#[cfg(any(
+ target_feature = "v6k", // includes v7-M but excludes v6-M
+ doc
+))]
+pub unsafe fn __strexh(value: u16, addr: *mut u16) -> u32 {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.strex.p0i16"]
+ fn strex16(value: u32, addr: *mut u16) -> u32;
+ }
+
+ strex16(value as u32, addr)
+}
+
+/// Executes an exclusive STR instruction for 32-bit values
+///
+/// Returns `0` if the operation succeeded, or `1` if it failed
+// Supported: v6, v7-M, v6K, v7-A, v7-R, v8
+// Not supported: v5, v6-M
+#[cfg(any(
+ all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M
+ all(target_feature = "v7", target_feature = "mclass"), // v7-M
+ doc
+))]
+pub unsafe fn __strex(value: u32, addr: *mut u32) -> u32 {
+ extern "unadjusted" {
+ #[link_name = "llvm.arm.strex.p0i32"]
+ fn strex32(value: u32, addr: *mut u32) -> u32;
+ }
+
+ strex32(value, addr)
+}
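+
+// Illustrative sketch of the intended usage pattern (a hypothetical atomic add
+// built from the exclusive pair; not part of this module): LDREX marks the
+// address for exclusive access and STREX only succeeds (returns 0) while the
+// exclusive monitor is still held, so the read-modify-write is retried until
+// it commits.
+//
+//     unsafe fn atomic_add(addr: *mut u32, val: u32) -> u32 {
+//         loop {
+//             let old = __ldrex(addr);
+//             if __strex(old.wrapping_add(val), addr) == 0 {
+//                 return old;
+//             }
+//             // STREX returned 1: the reservation was lost, try again.
+//         }
+//     }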
diff --git a/library/stdarch/crates/core_arch/src/arm/mod.rs b/library/stdarch/crates/core_arch/src/arm/mod.rs
new file mode 100644
index 000000000..efe0068d4
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/mod.rs
@@ -0,0 +1,113 @@
+//! ARM intrinsics.
+//!
+//! The reference for NEON is [ARM's NEON Intrinsics Reference][arm_ref]. The
+//! [ARM's NEON Intrinsics Online Database][arm_dat] is also useful.
+//!
+//! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf
+//! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics
+
+mod armclang;
+pub use self::armclang::*;
+
+mod v6;
+pub use self::v6::*;
+
+// Supported arches: 6, 7-M. See Section 10.1 of ACLE (e.g. SSAT)
+#[cfg(any(target_feature = "v6", doc))]
+mod sat;
+
+#[cfg(any(target_feature = "v6", doc))]
+pub use self::sat::*;
+
+// Supported arches: 5TE, 7E-M. See Section 10.1 of ACLE (e.g. QADD)
+// We also include the A profile even though DSP is deprecated on that profile as of ACLE 2.0 (see
+// section 5.4.7)
+// Here we workaround the difference between LLVM's +dsp and ACLE's __ARM_FEATURE_DSP by gating on
+// '+v5te' rather than on '+dsp'
+#[cfg(any(
+ // >= v5TE but excludes v7-M
+ all(target_feature = "v5te", not(target_feature = "mclass")),
+ // v7E-M
+ all(target_feature = "mclass", target_feature = "dsp"),
+ doc,
+))]
+pub mod dsp;
+
+#[cfg(any(
+ // >= v5TE but excludes v7-M
+ all(target_feature = "v5te", not(target_feature = "mclass")),
+ // v7E-M
+ all(target_feature = "mclass", target_feature = "dsp"),
+ doc,
+))]
+pub use self::dsp::*;
+
+// Section 5.4.9 of ACLE says these are deprecated in ACLE 2.0 for the A profile but fully
+// supported on the M and R profiles. We expose them for the A profile even though they are
+// deprecated there.
+#[cfg(any(
+ // v7-A, v7-R
+ all(target_feature = "v6", not(target_feature = "mclass")),
+ // v7E-M
+ all(target_feature = "mclass", target_feature = "dsp"),
+ doc,
+))]
+mod simd32;
+
+#[cfg(any(
+ // v7-A, v7-R
+ all(target_feature = "v6", not(target_feature = "mclass")),
+ // v7E-M
+ all(target_feature = "mclass", target_feature = "dsp"),
+ doc,
+))]
+pub use self::simd32::*;
+
+#[cfg(any(target_feature = "v7", doc))]
+mod v7;
+#[cfg(any(target_feature = "v7", doc))]
+pub use self::v7::*;
+
+mod ex;
+pub use self::ex::*;
+
+pub use crate::core_arch::arm_shared::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[cfg(any(target_feature = "v7", doc))]
+pub(crate) mod neon;
+#[cfg(any(target_feature = "v7", doc))]
+pub use neon::*;
+
+/// Generates the trap instruction `UDF`
+#[cfg(target_arch = "arm")]
+#[cfg_attr(test, assert_instr(udf))]
+#[inline]
+pub unsafe fn udf() -> ! {
+ crate::intrinsics::abort()
+}
+
+/// Generates a DBG instruction.
+///
+/// This provides a hint to debugging and related systems. The argument must be
+/// a constant integer from 0 to 15 inclusive. See implementation documentation
+/// for the effect (if any) of this instruction and the meaning of the
+/// argument. This is available only when compiling for AArch32.
+// Section 10.1 of ACLE says that the supported arches are: 7, 7-M
+// "The DBG hint instruction is added in ARMv7. It is UNDEFINED in the ARMv6 base architecture, and
+// executes as a NOP instruction in ARMv6K and ARMv6T2." - ARM Architecture Reference Manual ARMv7-A
+// and ARMv7-R edition (ARM DDI 0406C.c) sections D12.4.1 "ARM instruction set support" and D12.4.2
+// "Thumb instruction set support"
+#[cfg(any(target_feature = "v7", doc))]
+#[inline(always)]
+#[rustc_legacy_const_generics(0)]
+pub unsafe fn __dbg<const IMM4: i32>() {
+ static_assert_imm4!(IMM4);
+ dbg(IMM4);
+}
+
+extern "unadjusted" {
+ #[link_name = "llvm.arm.dbg"]
+ fn dbg(_: i32);
+}
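+
+// Illustrative note (hypothetical call site): the hint value is a const
+// generic, so a caller writes e.g. `__dbg::<0>()`, which emits `DBG #0`; on
+// cores where DBG executes as a NOP the call has no architectural effect.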
diff --git a/library/stdarch/crates/core_arch/src/arm/neon.rs b/library/stdarch/crates/core_arch/src/arm/neon.rs
new file mode 100644
index 000000000..a0ad92c33
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/neon.rs
@@ -0,0 +1,1369 @@
+use crate::core_arch::arm_shared::neon::*;
+use crate::core_arch::simd::{f32x4, i32x4, u32x4};
+use crate::core_arch::simd_llvm::*;
+use crate::mem::{align_of, transmute};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+#[allow(non_camel_case_types)]
+pub(crate) type p8 = u8;
+#[allow(non_camel_case_types)]
+pub(crate) type p16 = u16;
+
+#[allow(improper_ctypes)]
+extern "unadjusted" {
+ #[link_name = "llvm.arm.neon.vbsl.v8i8"]
+ fn vbsl_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t;
+ #[link_name = "llvm.arm.neon.vbsl.v16i8"]
+ fn vbslq_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t;
+ #[link_name = "llvm.arm.neon.vpadals.v4i16.v8i8"]
+ pub(crate) fn vpadal_s8_(a: int16x4_t, b: int8x8_t) -> int16x4_t;
+ #[link_name = "llvm.arm.neon.vpadals.v2i32.v4i16"]
+ pub(crate) fn vpadal_s16_(a: int32x2_t, b: int16x4_t) -> int32x2_t;
+ #[link_name = "llvm.arm.neon.vpadals.v1i64.v2i32"]
+ pub(crate) fn vpadal_s32_(a: int64x1_t, b: int32x2_t) -> int64x1_t;
+ #[link_name = "llvm.arm.neon.vpadals.v8i16.v16i8"]
+ pub(crate) fn vpadalq_s8_(a: int16x8_t, b: int8x16_t) -> int16x8_t;
+ #[link_name = "llvm.arm.neon.vpadals.v4i32.v8i16"]
+ pub(crate) fn vpadalq_s16_(a: int32x4_t, b: int16x8_t) -> int32x4_t;
+ #[link_name = "llvm.arm.neon.vpadals.v2i64.v4i32"]
+ pub(crate) fn vpadalq_s32_(a: int64x2_t, b: int32x4_t) -> int64x2_t;
+
+ #[link_name = "llvm.arm.neon.vpadalu.v4i16.v8i8"]
+ pub(crate) fn vpadal_u8_(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t;
+ #[link_name = "llvm.arm.neon.vpadalu.v2i32.v4i16"]
+ pub(crate) fn vpadal_u16_(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t;
+ #[link_name = "llvm.arm.neon.vpadalu.v1i64.v2i32"]
+ pub(crate) fn vpadal_u32_(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t;
+ #[link_name = "llvm.arm.neon.vpadalu.v8i16.v16i8"]
+ pub(crate) fn vpadalq_u8_(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t;
+ #[link_name = "llvm.arm.neon.vpadalu.v4i32.v8i16"]
+ pub(crate) fn vpadalq_u16_(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t;
+ #[link_name = "llvm.arm.neon.vpadalu.v2i64.v4i32"]
+ pub(crate) fn vpadalq_u32_(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t;
+
+ #[link_name = "llvm.arm.neon.vtbl1"]
+ fn vtbl1(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+ #[link_name = "llvm.arm.neon.vtbl2"]
+    fn vtbl2(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t;
+    #[link_name = "llvm.arm.neon.vtbl3"]
+    fn vtbl3(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t;
+    #[link_name = "llvm.arm.neon.vtbl4"]
+    fn vtbl4(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, e: int8x8_t) -> int8x8_t;
+
+    #[link_name = "llvm.arm.neon.vtbx1"]
+    fn vtbx1(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t;
+    #[link_name = "llvm.arm.neon.vtbx2"]
+    fn vtbx2(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t;
+    #[link_name = "llvm.arm.neon.vtbx3"]
+    fn vtbx3(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, e: int8x8_t) -> int8x8_t;
+    #[link_name = "llvm.arm.neon.vtbx4"]
+    fn vtbx4(
+        a: int8x8_t,
+        b: int8x8_t,
+        c: int8x8_t,
+        d: int8x8_t,
+        e: int8x8_t,
+        f: int8x8_t,
+    ) -> int8x8_t;
+
+ #[link_name = "llvm.arm.neon.vshiftins.v8i8"]
+ fn vshiftins_v8i8(a: int8x8_t, b: int8x8_t, shift: int8x8_t) -> int8x8_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v16i8"]
+ fn vshiftins_v16i8(a: int8x16_t, b: int8x16_t, shift: int8x16_t) -> int8x16_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v4i16"]
+ fn vshiftins_v4i16(a: int16x4_t, b: int16x4_t, shift: int16x4_t) -> int16x4_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v8i16"]
+ fn vshiftins_v8i16(a: int16x8_t, b: int16x8_t, shift: int16x8_t) -> int16x8_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v2i32"]
+ fn vshiftins_v2i32(a: int32x2_t, b: int32x2_t, shift: int32x2_t) -> int32x2_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v4i32"]
+ fn vshiftins_v4i32(a: int32x4_t, b: int32x4_t, shift: int32x4_t) -> int32x4_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v1i64"]
+ fn vshiftins_v1i64(a: int64x1_t, b: int64x1_t, shift: int64x1_t) -> int64x1_t;
+ #[link_name = "llvm.arm.neon.vshiftins.v2i64"]
+ fn vshiftins_v2i64(a: int64x2_t, b: int64x2_t, shift: int64x2_t) -> int64x2_t;
+
+ #[link_name = "llvm.arm.neon.vld1.v8i8.p0i8"]
+ fn vld1_v8i8(addr: *const i8, align: i32) -> int8x8_t;
+ #[link_name = "llvm.arm.neon.vld1.v16i8.p0i8"]
+ fn vld1q_v16i8(addr: *const i8, align: i32) -> int8x16_t;
+ #[link_name = "llvm.arm.neon.vld1.v4i16.p0i8"]
+ fn vld1_v4i16(addr: *const i8, align: i32) -> int16x4_t;
+ #[link_name = "llvm.arm.neon.vld1.v8i16.p0i8"]
+ fn vld1q_v8i16(addr: *const i8, align: i32) -> int16x8_t;
+ #[link_name = "llvm.arm.neon.vld1.v2i32.p0i8"]
+ fn vld1_v2i32(addr: *const i8, align: i32) -> int32x2_t;
+ #[link_name = "llvm.arm.neon.vld1.v4i32.p0i8"]
+ fn vld1q_v4i32(addr: *const i8, align: i32) -> int32x4_t;
+ #[link_name = "llvm.arm.neon.vld1.v1i64.p0i8"]
+ fn vld1_v1i64(addr: *const i8, align: i32) -> int64x1_t;
+ #[link_name = "llvm.arm.neon.vld1.v2i64.p0i8"]
+ fn vld1q_v2i64(addr: *const i8, align: i32) -> int64x2_t;
+ #[link_name = "llvm.arm.neon.vld1.v2f32.p0i8"]
+ fn vld1_v2f32(addr: *const i8, align: i32) -> float32x2_t;
+ #[link_name = "llvm.arm.neon.vld1.v4f32.p0i8"]
+ fn vld1q_v4f32(addr: *const i8, align: i32) -> float32x4_t;
+
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v8i8"]
+ fn vst1_v8i8(addr: *const i8, val: int8x8_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v16i8"]
+ fn vst1q_v16i8(addr: *const i8, val: int8x16_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v4i16"]
+ fn vst1_v4i16(addr: *const i8, val: int16x4_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v8i16"]
+ fn vst1q_v8i16(addr: *const i8, val: int16x8_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v2i32"]
+ fn vst1_v2i32(addr: *const i8, val: int32x2_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v4i32"]
+ fn vst1q_v4i32(addr: *const i8, val: int32x4_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v1i64"]
+ fn vst1_v1i64(addr: *const i8, val: int64x1_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v2i64"]
+ fn vst1q_v2i64(addr: *const i8, val: int64x2_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v2f32"]
+ fn vst1_v2f32(addr: *const i8, val: float32x2_t, align: i32);
+ #[link_name = "llvm.arm.neon.vst1.p0i8.v4f32"]
+ fn vst1q_v4f32(addr: *const i8, val: float32x4_t, align: i32);
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1_s8(ptr: *const i8) -> int8x8_t {
+ vld1_v8i8(ptr as *const i8, align_of::<i8>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1q_s8(ptr: *const i8) -> int8x16_t {
+ vld1q_v16i8(ptr as *const i8, align_of::<i8>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1_s16(ptr: *const i16) -> int16x4_t {
+ vld1_v4i16(ptr as *const i8, align_of::<i16>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1q_s16(ptr: *const i16) -> int16x8_t {
+ vld1q_v8i16(ptr as *const i8, align_of::<i16>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_s32(ptr: *const i32) -> int32x2_t {
+ vld1_v2i32(ptr as *const i8, align_of::<i32>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.32"))]
+pub unsafe fn vld1q_s32(ptr: *const i32) -> int32x4_t {
+ vld1q_v4i32(ptr as *const i8, align_of::<i32>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_s64(ptr: *const i64) -> int64x1_t {
+ vld1_v1i64(ptr as *const i8, align_of::<i64>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.64"))]
+pub unsafe fn vld1q_s64(ptr: *const i64) -> int64x2_t {
+ vld1q_v2i64(ptr as *const i8, align_of::<i64>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1_u8(ptr: *const u8) -> uint8x8_t {
+ transmute(vld1_v8i8(ptr as *const i8, align_of::<u8>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1q_u8(ptr: *const u8) -> uint8x16_t {
+ transmute(vld1q_v16i8(ptr as *const i8, align_of::<u8>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1_u16(ptr: *const u16) -> uint16x4_t {
+ transmute(vld1_v4i16(ptr as *const i8, align_of::<u16>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1q_u16(ptr: *const u16) -> uint16x8_t {
+ transmute(vld1q_v8i16(ptr as *const i8, align_of::<u16>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_u32(ptr: *const u32) -> uint32x2_t {
+ transmute(vld1_v2i32(ptr as *const i8, align_of::<u32>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.32"))]
+pub unsafe fn vld1q_u32(ptr: *const u32) -> uint32x4_t {
+ transmute(vld1q_v4i32(ptr as *const i8, align_of::<u32>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_u64(ptr: *const u64) -> uint64x1_t {
+ transmute(vld1_v1i64(ptr as *const i8, align_of::<u64>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.64"))]
+pub unsafe fn vld1q_u64(ptr: *const u64) -> uint64x2_t {
+ transmute(vld1q_v2i64(ptr as *const i8, align_of::<u64>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1_p8(ptr: *const p8) -> poly8x8_t {
+ transmute(vld1_v8i8(ptr as *const i8, align_of::<p8>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1q_p8(ptr: *const p8) -> poly8x16_t {
+ transmute(vld1q_v16i8(ptr as *const i8, align_of::<p8>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1_p16(ptr: *const p16) -> poly16x4_t {
+ transmute(vld1_v4i16(ptr as *const i8, align_of::<p16>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t {
+ transmute(vld1q_v8i16(ptr as *const i8, align_of::<p16>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_p64(ptr: *const p64) -> poly64x1_t {
+ transmute(vld1_v1i64(ptr as *const i8, align_of::<p64>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr("vld1.64"))]
+pub unsafe fn vld1q_p64(ptr: *const p64) -> poly64x2_t {
+ transmute(vld1q_v2i64(ptr as *const i8, align_of::<p64>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_f32(ptr: *const f32) -> float32x2_t {
+ vld1_v2f32(ptr as *const i8, align_of::<f32>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.32"))]
+pub unsafe fn vld1q_f32(ptr: *const f32) -> float32x4_t {
+ vld1q_v4f32(ptr as *const i8, align_of::<f32>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.8"))]
+pub unsafe fn vst1_s8(ptr: *mut i8, a: int8x8_t) {
+ vst1_v8i8(ptr as *const i8, a, align_of::<i8>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.8"))]
+pub unsafe fn vst1q_s8(ptr: *mut i8, a: int8x16_t) {
+ vst1q_v16i8(ptr as *const i8, a, align_of::<i8>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.16"))]
+pub unsafe fn vst1_s16(ptr: *mut i16, a: int16x4_t) {
+ vst1_v4i16(ptr as *const i8, a, align_of::<i16>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.16"))]
+pub unsafe fn vst1q_s16(ptr: *mut i16, a: int16x8_t) {
+ vst1q_v8i16(ptr as *const i8, a, align_of::<i16>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.32"))]
+pub unsafe fn vst1_s32(ptr: *mut i32, a: int32x2_t) {
+ vst1_v2i32(ptr as *const i8, a, align_of::<i32>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.32"))]
+pub unsafe fn vst1q_s32(ptr: *mut i32, a: int32x4_t) {
+ vst1q_v4i32(ptr as *const i8, a, align_of::<i32>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1_s64(ptr: *mut i64, a: int64x1_t) {
+ vst1_v1i64(ptr as *const i8, a, align_of::<i64>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1q_s64(ptr: *mut i64, a: int64x2_t) {
+ vst1q_v2i64(ptr as *const i8, a, align_of::<i64>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.8"))]
+pub unsafe fn vst1_u8(ptr: *mut u8, a: uint8x8_t) {
+ vst1_v8i8(ptr as *const i8, transmute(a), align_of::<u8>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.8"))]
+pub unsafe fn vst1q_u8(ptr: *mut u8, a: uint8x16_t) {
+ vst1q_v16i8(ptr as *const i8, transmute(a), align_of::<u8>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.16"))]
+pub unsafe fn vst1_u16(ptr: *mut u16, a: uint16x4_t) {
+ vst1_v4i16(ptr as *const i8, transmute(a), align_of::<u16>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.16"))]
+pub unsafe fn vst1q_u16(ptr: *mut u16, a: uint16x8_t) {
+ vst1q_v8i16(ptr as *const i8, transmute(a), align_of::<u16>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.32"))]
+pub unsafe fn vst1_u32(ptr: *mut u32, a: uint32x2_t) {
+ vst1_v2i32(ptr as *const i8, transmute(a), align_of::<u32>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.32"))]
+pub unsafe fn vst1q_u32(ptr: *mut u32, a: uint32x4_t) {
+ vst1q_v4i32(ptr as *const i8, transmute(a), align_of::<u32>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1_u64(ptr: *mut u64, a: uint64x1_t) {
+ vst1_v1i64(ptr as *const i8, transmute(a), align_of::<u64>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1q_u64(ptr: *mut u64, a: uint64x2_t) {
+ vst1q_v2i64(ptr as *const i8, transmute(a), align_of::<u64>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.8"))]
+pub unsafe fn vst1_p8(ptr: *mut p8, a: poly8x8_t) {
+ vst1_v8i8(ptr as *const i8, transmute(a), align_of::<p8>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.8"))]
+pub unsafe fn vst1q_p8(ptr: *mut p8, a: poly8x16_t) {
+ vst1q_v16i8(ptr as *const i8, transmute(a), align_of::<p8>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.16"))]
+pub unsafe fn vst1_p16(ptr: *mut p16, a: poly16x4_t) {
+ vst1_v4i16(ptr as *const i8, transmute(a), align_of::<p16>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.16"))]
+pub unsafe fn vst1q_p16(ptr: *mut p16, a: poly16x8_t) {
+    vst1q_v8i16(ptr as *const i8, transmute(a), align_of::<p16>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes,v8")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1_p64(ptr: *mut p64, a: poly64x1_t) {
+ vst1_v1i64(ptr as *const i8, transmute(a), align_of::<p64>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,aes,v8")]
+#[cfg_attr(test, assert_instr("vst1.64"))]
+pub unsafe fn vst1q_p64(ptr: *mut p64, a: poly64x2_t) {
+ vst1q_v2i64(ptr as *const i8, transmute(a), align_of::<p64>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.32"))]
+pub unsafe fn vst1_f32(ptr: *mut f32, a: float32x2_t) {
+ vst1_v2f32(ptr as *const i8, a, align_of::<f32>() as i32)
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers.
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vst1.32"))]
+pub unsafe fn vst1q_f32(ptr: *mut f32, a: float32x4_t) {
+ vst1q_v4f32(ptr as *const i8, a, align_of::<f32>() as i32)
+}
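+
+// Illustrative sketch (hypothetical helper): a vld1q/vst1q round trip that
+// scales four `f32` values in place. Assumes `ptr` points to at least four
+// readable and writable `f32`s; no alignment beyond `align_of::<f32>()` is
+// required. `vmulq_n_f32` comes from the shared NEON API re-exported by this
+// module's parent.
+//
+//     unsafe fn scale4_in_place(ptr: *mut f32, factor: f32) {
+//         let v = vld1q_f32(ptr);         // one VLD1.32 load of 4 lanes
+//         let v = vmulq_n_f32(v, factor); // lane-wise multiply
+//         vst1q_f32(ptr, v);              // one VST1.32 store of 4 lanes
+//     }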
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ vtbl1(a, b)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ transmute(vtbl1(transmute(a), transmute(b)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl1_p8(a: poly8x8_t, b: uint8x8_t) -> poly8x8_t {
+ transmute(vtbl1(transmute(a), transmute(b)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl2_s8(a: int8x8x2_t, b: int8x8_t) -> int8x8_t {
+ vtbl2(a.0, a.1, b)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl2_u8(a: uint8x8x2_t, b: uint8x8_t) -> uint8x8_t {
+ transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl2_p8(a: poly8x8x2_t, b: uint8x8_t) -> poly8x8_t {
+ transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b)))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl3_s8(a: int8x8x3_t, b: int8x8_t) -> int8x8_t {
+ vtbl3(a.0, a.1, a.2, b)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl3_u8(a: uint8x8x3_t, b: uint8x8_t) -> uint8x8_t {
+ transmute(vtbl3(
+ transmute(a.0),
+ transmute(a.1),
+ transmute(a.2),
+ transmute(b),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl3_p8(a: poly8x8x3_t, b: uint8x8_t) -> poly8x8_t {
+ transmute(vtbl3(
+ transmute(a.0),
+ transmute(a.1),
+ transmute(a.2),
+ transmute(b),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl4_s8(a: int8x8x4_t, b: int8x8_t) -> int8x8_t {
+ vtbl4(a.0, a.1, a.2, a.3, b)
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl4_u8(a: uint8x8x4_t, b: uint8x8_t) -> uint8x8_t {
+ transmute(vtbl4(
+ transmute(a.0),
+ transmute(a.1),
+ transmute(a.2),
+ transmute(a.3),
+ transmute(b),
+ ))
+}
+
+/// Table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbl))]
+pub unsafe fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t {
+ transmute(vtbl4(
+ transmute(a.0),
+ transmute(a.1),
+ transmute(a.2),
+ transmute(a.3),
+ transmute(b),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx1_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t {
+ vtbx1(a, b, c)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t {
+ transmute(vtbx1(transmute(a), transmute(b), transmute(c)))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx1_p8(a: poly8x8_t, b: poly8x8_t, c: uint8x8_t) -> poly8x8_t {
+ transmute(vtbx1(transmute(a), transmute(b), transmute(c)))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx2_s8(a: int8x8_t, b: int8x8x2_t, c: int8x8_t) -> int8x8_t {
+ vtbx2(a, b.0, b.1, c)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx2_u8(a: uint8x8_t, b: uint8x8x2_t, c: uint8x8_t) -> uint8x8_t {
+ transmute(vtbx2(
+ transmute(a),
+ transmute(b.0),
+ transmute(b.1),
+ transmute(c),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t {
+ transmute(vtbx2(
+ transmute(a),
+ transmute(b.0),
+ transmute(b.1),
+ transmute(c),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx3_s8(a: int8x8_t, b: int8x8x3_t, c: int8x8_t) -> int8x8_t {
+ vtbx3(a, b.0, b.1, b.2, c)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t {
+ transmute(vtbx3(
+ transmute(a),
+ transmute(b.0),
+ transmute(b.1),
+ transmute(b.2),
+ transmute(c),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t {
+ transmute(vtbx3(
+ transmute(a),
+ transmute(b.0),
+ transmute(b.1),
+ transmute(b.2),
+ transmute(c),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx4_s8(a: int8x8_t, b: int8x8x4_t, c: int8x8_t) -> int8x8_t {
+ vtbx4(a, b.0, b.1, b.2, b.3, c)
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx4_u8(a: uint8x8_t, b: uint8x8x4_t, c: uint8x8_t) -> uint8x8_t {
+ transmute(vtbx4(
+ transmute(a),
+ transmute(b.0),
+ transmute(b.1),
+ transmute(b.2),
+ transmute(b.3),
+ transmute(c),
+ ))
+}
+
+/// Extended table look-up
+#[inline]
+#[cfg(target_endian = "little")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vtbx))]
+pub unsafe fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> poly8x8_t {
+ transmute(vtbx4(
+ transmute(a),
+ transmute(b.0),
+ transmute(b.1),
+ transmute(b.2),
+ transmute(b.3),
+ transmute(c),
+ ))
+}
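+
+// Illustrative sketch of the vtbl/vtbx distinction (hypothetical values):
+// indices past the end of the table produce 0 with vtbl1, but leave the
+// corresponding lane of the first argument untouched with vtbx1.
+//
+//     unsafe fn tbl_example(table: uint8x8_t, fallback: uint8x8_t, idx: uint8x8_t) {
+//         let _zero_filled = vtbl1_u8(table, idx);         // out-of-range lanes -> 0
+//         let _preserved = vtbx1_u8(fallback, table, idx); // out-of-range lanes -> fallback
+//     }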
+
+// These float-to-int implementations have undefined behaviour when `a` overflows
+// the destination type. Clang has the same problem: https://llvm.org/PR47510
+
+/// Floating-point Convert to Signed fixed-point, rounding toward Zero (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[target_feature(enable = "v7")]
+#[cfg_attr(test, assert_instr("vcvt.s32.f32"))]
+pub unsafe fn vcvtq_s32_f32(a: float32x4_t) -> int32x4_t {
+ transmute(simd_cast::<_, i32x4>(transmute::<_, f32x4>(a)))
+}
+
+/// Floating-point Convert to Unsigned fixed-point, rounding toward Zero (vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[target_feature(enable = "v7")]
+#[cfg_attr(test, assert_instr("vcvt.u32.f32"))]
+pub unsafe fn vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t {
+ transmute(simd_cast::<_, u32x4>(transmute::<_, f32x4>(a)))
+}
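+
+// Worked example (hypothetical lanes): the conversion truncates toward zero,
+// so a float32x4_t holding [-1.7, 0.5, 3.9, 42.0] becomes [-1, 0, 3, 42] under
+// vcvtq_s32_f32. Lanes whose value does not fit the destination type fall
+// under the undefined behaviour noted above.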
+
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ static_assert_imm3!(N);
+ let n = N as i8;
+ vshiftins_v8i8(a, b, int8x8_t(n, n, n, n, n, n, n, n))
+}
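+
+// Worked example of the shift-left-and-insert semantics (hypothetical lane
+// values): each result lane is (b << N) | (a & ((1 << N) - 1)), i.e. the low N
+// bits of `a` survive and the rest is replaced by `b` shifted left. With
+// a = 0b1111_1111, b = 0b0000_0011 and N = 4:
+//     (0b0000_0011 << 4) | (0b1111_1111 & 0b0000_1111) = 0b0011_1111
+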
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ static_assert_imm3!(N);
+ let n = N as i8;
+ vshiftins_v16i8(
+ a,
+ b,
+ int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n),
+ )
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ static_assert_imm4!(N);
+ let n = N as i16;
+ vshiftins_v4i16(a, b, int16x4_t(n, n, n, n))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert_imm4!(N);
+ let n = N as i16;
+ vshiftins_v8i16(a, b, int16x8_t(n, n, n, n, n, n, n, n))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert!(N: i32 where N >= 0 && N <= 31);
+ vshiftins_v2i32(a, b, int32x2_t(N, N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert!(N: i32 where N >= 0 && N <= 31);
+ vshiftins_v4i32(a, b, int32x4_t(N, N, N, N))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ static_assert!(N : i32 where 0 <= N && N <= 63);
+ vshiftins_v1i64(a, b, int64x1_t(N as i64))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ static_assert!(N : i32 where 0 <= N && N <= 63);
+ vshiftins_v2i64(a, b, int64x2_t(N as i64, N as i64))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ static_assert_imm3!(N);
+ let n = N as i8;
+ transmute(vshiftins_v8i8(
+ transmute(a),
+ transmute(b),
+ int8x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ static_assert_imm3!(N);
+ let n = N as i8;
+ transmute(vshiftins_v16i8(
+ transmute(a),
+ transmute(b),
+ int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ static_assert_imm4!(N);
+ let n = N as i16;
+ transmute(vshiftins_v4i16(
+ transmute(a),
+ transmute(b),
+ int16x4_t(n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ static_assert_imm4!(N);
+ let n = N as i16;
+ transmute(vshiftins_v8i16(
+ transmute(a),
+ transmute(b),
+ int16x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ static_assert!(N: i32 where N >= 0 && N <= 31);
+ transmute(vshiftins_v2i32(transmute(a), transmute(b), int32x2_t(N, N)))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ static_assert!(N: i32 where N >= 0 && N <= 31);
+ transmute(vshiftins_v4i32(
+ transmute(a),
+ transmute(b),
+ int32x4_t(N, N, N, N),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ static_assert!(N : i32 where 0 <= N && N <= 63);
+ transmute(vshiftins_v1i64(
+ transmute(a),
+ transmute(b),
+ int64x1_t(N as i64),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ static_assert!(N : i32 where 0 <= N && N <= 63);
+ transmute(vshiftins_v2i64(
+ transmute(a),
+ transmute(b),
+ int64x2_t(N as i64, N as i64),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_p8<const N: i32>(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ static_assert_imm3!(N);
+ let n = N as i8;
+ transmute(vshiftins_v8i8(
+ transmute(a),
+ transmute(b),
+ int8x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_p8<const N: i32>(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ static_assert_imm3!(N);
+ let n = N as i8;
+ transmute(vshiftins_v16i8(
+ transmute(a),
+ transmute(b),
+ int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_p16<const N: i32>(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ static_assert_imm4!(N);
+ let n = N as i16;
+ transmute(vshiftins_v4i16(
+ transmute(a),
+ transmute(b),
+ int16x4_t(n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsli.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_p16<const N: i32>(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ static_assert_imm4!(N);
+ let n = N as i16;
+ transmute(vshiftins_v8i16(
+ transmute(a),
+ transmute(b),
+ int16x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7,aes")]
+#[cfg_attr(test, assert_instr("vsli.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsli_n_p64<const N: i32>(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t {
+ static_assert!(N : i32 where 0 <= N && N <= 63);
+ transmute(vshiftins_v1i64(
+ transmute(a),
+ transmute(b),
+ int64x1_t(N as i64),
+ ))
+}
+/// Shift Left and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7,aes")]
+#[cfg_attr(test, assert_instr("vsli.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsliq_n_p64<const N: i32>(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ static_assert!(N : i32 where 0 <= N && N <= 63);
+ transmute(vshiftins_v2i64(
+ transmute(a),
+ transmute(b),
+ int64x2_t(N as i64, N as i64),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+ static_assert!(N : i32 where 1 <= N && N <= 8);
+ let n = -N as i8;
+ vshiftins_v8i8(a, b, int8x8_t(n, n, n, n, n, n, n, n))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+ static_assert!(N : i32 where 1 <= N && N <= 8);
+ let n = -N as i8;
+ vshiftins_v16i8(
+ a,
+ b,
+ int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n),
+ )
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+ static_assert!(N : i32 where 1 <= N && N <= 16);
+ let n = -N as i16;
+ vshiftins_v4i16(a, b, int16x4_t(n, n, n, n))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+ static_assert!(N : i32 where 1 <= N && N <= 16);
+ let n = -N as i16;
+ vshiftins_v8i16(a, b, int16x8_t(n, n, n, n, n, n, n, n))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+ static_assert!(N : i32 where 1 <= N && N <= 32);
+ vshiftins_v2i32(a, b, int32x2_t(-N, -N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+ static_assert!(N : i32 where 1 <= N && N <= 32);
+ vshiftins_v4i32(a, b, int32x4_t(-N, -N, -N, -N))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+ static_assert!(N : i32 where 1 <= N && N <= 64);
+ vshiftins_v1i64(a, b, int64x1_t(-N as i64))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+ static_assert!(N : i32 where 1 <= N && N <= 64);
+ vshiftins_v2i64(a, b, int64x2_t(-N as i64, -N as i64))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+ static_assert!(N : i32 where 1 <= N && N <= 8);
+ let n = -N as i8;
+ transmute(vshiftins_v8i8(
+ transmute(a),
+ transmute(b),
+ int8x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+ static_assert!(N : i32 where 1 <= N && N <= 8);
+ let n = -N as i8;
+ transmute(vshiftins_v16i8(
+ transmute(a),
+ transmute(b),
+ int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+ static_assert!(N : i32 where 1 <= N && N <= 16);
+ let n = -N as i16;
+ transmute(vshiftins_v4i16(
+ transmute(a),
+ transmute(b),
+ int16x4_t(n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+ static_assert!(N : i32 where 1 <= N && N <= 16);
+ let n = -N as i16;
+ transmute(vshiftins_v8i16(
+ transmute(a),
+ transmute(b),
+ int16x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+ static_assert!(N : i32 where 1 <= N && N <= 32);
+ transmute(vshiftins_v2i32(
+ transmute(a),
+ transmute(b),
+ int32x2_t(-N, -N),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.32", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+ static_assert!(N : i32 where 1 <= N && N <= 32);
+ transmute(vshiftins_v4i32(
+ transmute(a),
+ transmute(b),
+ int32x4_t(-N, -N, -N, -N),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+ static_assert!(N : i32 where 1 <= N && N <= 64);
+ transmute(vshiftins_v1i64(
+ transmute(a),
+ transmute(b),
+ int64x1_t(-N as i64),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+ static_assert!(N : i32 where 1 <= N && N <= 64);
+ transmute(vshiftins_v2i64(
+ transmute(a),
+ transmute(b),
+ int64x2_t(-N as i64, -N as i64),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_p8<const N: i32>(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t {
+ static_assert!(N : i32 where 1 <= N && N <= 8);
+ let n = -N as i8;
+ transmute(vshiftins_v8i8(
+ transmute(a),
+ transmute(b),
+ int8x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.8", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_p8<const N: i32>(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t {
+ static_assert!(N : i32 where 1 <= N && N <= 8);
+ let n = -N as i8;
+ transmute(vshiftins_v16i8(
+ transmute(a),
+ transmute(b),
+ int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_p16<const N: i32>(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t {
+ static_assert!(N : i32 where 1 <= N && N <= 16);
+ let n = -N as i16;
+ transmute(vshiftins_v4i16(
+ transmute(a),
+ transmute(b),
+ int16x4_t(n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vsri.16", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_p16<const N: i32>(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t {
+ static_assert!(N : i32 where 1 <= N && N <= 16);
+ let n = -N as i16;
+ transmute(vshiftins_v8i16(
+ transmute(a),
+ transmute(b),
+ int16x8_t(n, n, n, n, n, n, n, n),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7,aes")]
+#[cfg_attr(test, assert_instr("vsri.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsri_n_p64<const N: i32>(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t {
+ static_assert!(N : i32 where 1 <= N && N <= 64);
+ transmute(vshiftins_v1i64(
+ transmute(a),
+ transmute(b),
+ int64x1_t(-N as i64),
+ ))
+}
+/// Shift Right and Insert (immediate)
+#[inline]
+#[target_feature(enable = "neon,v7,aes")]
+#[cfg_attr(test, assert_instr("vsri.64", N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsriq_n_p64<const N: i32>(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t {
+ static_assert!(N : i32 where 1 <= N && N <= 64);
+ transmute(vshiftins_v2i64(
+ transmute(a),
+ transmute(b),
+ int64x2_t(-N as i64, -N as i64),
+ ))
+}
+
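+// Editorial sketch, not part of this patch: a portable scalar model of the
+// "shift and insert" behaviour wrapped by the `vsli_n_*` and `vsri_n_*`
+// intrinsics above, written for a single `u32` lane. The vector forms apply
+// this per lane; element width only changes the types and the shift bounds.
+fn sli_u32_lane(a: u32, b: u32, n: u32) -> u32 {
+    assert!(n <= 31);
+    // shift `b` left by `n`; the low `n` bits of `a` are preserved
+    (b << n) | (a & ((1u32 << n) - 1))
+}
+
+fn sri_u32_lane(a: u32, b: u32, n: u32) -> u32 {
+    assert!((1..=32).contains(&n));
+    if n == 32 {
+        return a; // every bit of `b` is shifted out; `a` is unchanged
+    }
+    // shift `b` right by `n`; the high `n` bits of `a` are preserved
+    (b >> n) | (a & !(u32::MAX >> n))
+}
+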
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::core_arch::{arm::*, simd::*};
+ use crate::mem::transmute;
+ use stdarch_test::simd_test;
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_s32_f32() {
+ let f = f32x4::new(-1., 2., 3., 4.);
+ let e = i32x4::new(-1, 2, 3, 4);
+ let r: i32x4 = transmute(vcvtq_s32_f32(transmute(f)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vcvtq_u32_f32() {
+ let f = f32x4::new(1., 2., 3., 4.);
+ let e = u32x4::new(1, 2, 3, 4);
+ let r: u32x4 = transmute(vcvtq_u32_f32(transmute(f)));
+ assert_eq!(r, e);
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/sat.rs b/library/stdarch/crates/core_arch/src/arm/sat.rs
new file mode 100644
index 000000000..38c98d734
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/sat.rs
@@ -0,0 +1,8 @@
+//! # References:
+//!
+//! - Section 8.4 "Saturating intrinsics"
+//!
+//! Intrinsics that could live here:
+//!
+//! - __ssat
+//! - __usat
diff --git a/library/stdarch/crates/core_arch/src/arm/simd32.rs b/library/stdarch/crates/core_arch/src/arm/simd32.rs
new file mode 100644
index 000000000..2d867acc8
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/simd32.rs
@@ -0,0 +1,728 @@
+//! # References
+//!
+//! - Section 8.5 "32-bit SIMD intrinsics" of ACLE
+//!
+//! Intrinsics that could live here
+//!
+//! - \[x\] __sel
+//! - \[ \] __ssat16
+//! - \[ \] __usat16
+//! - \[ \] __sxtab16
+//! - \[ \] __sxtb16
+//! - \[ \] __uxtab16
+//! - \[ \] __uxtb16
+//! - \[x\] __qadd8
+//! - \[x\] __qsub8
+//! - \[x\] __sadd8
+//! - \[x\] __shadd8
+//! - \[x\] __shsub8
+//! - \[x\] __ssub8
+//! - \[ \] __uadd8
+//! - \[ \] __uhadd8
+//! - \[ \] __uhsub8
+//! - \[ \] __uqadd8
+//! - \[ \] __uqsub8
+//! - \[x\] __usub8
+//! - \[x\] __usad8
+//! - \[x\] __usada8
+//! - \[x\] __qadd16
+//! - \[x\] __qasx
+//! - \[x\] __qsax
+//! - \[x\] __qsub16
+//! - \[x\] __sadd16
+//! - \[x\] __sasx
+//! - \[x\] __shadd16
+//! - \[ \] __shasx
+//! - \[ \] __shsax
+//! - \[x\] __shsub16
+//! - \[ \] __ssax
+//! - \[ \] __ssub16
+//! - \[ \] __uadd16
+//! - \[ \] __uasx
+//! - \[ \] __uhadd16
+//! - \[ \] __uhasx
+//! - \[ \] __uhsax
+//! - \[ \] __uhsub16
+//! - \[ \] __uqadd16
+//! - \[ \] __uqasx
+//! - \[x\] __uqsax
+//! - \[ \] __uqsub16
+//! - \[ \] __usax
+//! - \[ \] __usub16
+//! - \[x\] __smlad
+//! - \[ \] __smladx
+//! - \[ \] __smlald
+//! - \[ \] __smlaldx
+//! - \[x\] __smlsd
+//! - \[ \] __smlsdx
+//! - \[ \] __smlsld
+//! - \[ \] __smlsldx
+//! - \[x\] __smuad
+//! - \[x\] __smuadx
+//! - \[x\] __smusd
+//! - \[x\] __smusdx
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+use crate::{core_arch::arm::dsp::int16x2_t, mem::transmute};
+
+types! {
+ /// ARM-specific 32-bit wide vector of four packed `i8`.
+ pub struct int8x4_t(i8, i8, i8, i8);
+ /// ARM-specific 32-bit wide vector of four packed `u8`.
+ pub struct uint8x4_t(u8, u8, u8, u8);
+}
+
+macro_rules! dsp_call {
+ ($name:expr, $a:expr, $b:expr) => {
+ transmute($name(transmute($a), transmute($b)))
+ };
+}
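+
+// Editorial sketch, not part of this patch: how the transmutes in `dsp_call!`
+// map between a packed 4 x i8 vector and the single 32-bit word that the
+// `llvm.arm.*` intrinsics below operate on. Lane 0 is the least significant
+// byte on a little-endian target.
+fn pack_i8x4(lanes: [i8; 4]) -> i32 {
+    i32::from_le_bytes([lanes[0] as u8, lanes[1] as u8, lanes[2] as u8, lanes[3] as u8])
+}
+
+fn unpack_i8x4(word: i32) -> [i8; 4] {
+    let b = word.to_le_bytes();
+    [b[0] as i8, b[1] as i8, b[2] as i8, b[3] as i8]
+}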
+
+extern "unadjusted" {
+ #[link_name = "llvm.arm.qadd8"]
+ fn arm_qadd8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qsub8"]
+ fn arm_qsub8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qsub16"]
+ fn arm_qsub16(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qadd16"]
+ fn arm_qadd16(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qasx"]
+ fn arm_qasx(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qsax"]
+ fn arm_qsax(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.sadd16"]
+ fn arm_sadd16(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.sadd8"]
+ fn arm_sadd8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlad"]
+ fn arm_smlad(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.smlsd"]
+ fn arm_smlsd(a: i32, b: i32, c: i32) -> i32;
+
+ #[link_name = "llvm.arm.sasx"]
+ fn arm_sasx(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.sel"]
+ fn arm_sel(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.shadd8"]
+ fn arm_shadd8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.shadd16"]
+ fn arm_shadd16(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.shsub8"]
+ fn arm_shsub8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.ssub8"]
+ fn arm_ssub8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.usub8"]
+ fn arm_usub8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.shsub16"]
+ fn arm_shsub16(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smuad"]
+ fn arm_smuad(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smuadx"]
+ fn arm_smuadx(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smusd"]
+ fn arm_smusd(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.smusdx"]
+ fn arm_smusdx(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.usad8"]
+ fn arm_usad8(a: i32, b: i32) -> u32;
+}
+
+/// Saturating four 8-bit integer additions
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+/// res\[2\] = a\[2\] + b\[2\]
+/// res\[3\] = a\[3\] + b\[3\]
+#[inline]
+#[cfg_attr(test, assert_instr(qadd8))]
+pub unsafe fn __qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_qadd8, a, b)
+}
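+
+// Editorial sketch, not part of this patch: a lane-by-lane model of the
+// saturating byte addition documented above, in portable Rust.
+fn qadd8_model(a: [i8; 4], b: [i8; 4]) -> [i8; 4] {
+    let mut r = [0i8; 4];
+    for i in 0..4 {
+        r[i] = a[i].saturating_add(b[i]); // each lane clamps to [i8::MIN, i8::MAX]
+    }
+    r
+}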
+
+/// Saturating four 8-bit integer subtractions
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+/// res\[2\] = a\[2\] - b\[2\]
+/// res\[3\] = a\[3\] - b\[3\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsub8))]
+pub unsafe fn __qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_qsub8, a, b)
+}
+
+/// Saturating two 16-bit integer subtractions
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsub16))]
+pub unsafe fn __qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_qsub16, a, b)
+}
+
+/// Saturating two 16-bit integer additions
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+#[inline]
+#[cfg_attr(test, assert_instr(qadd16))]
+pub unsafe fn __qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_qadd16, a, b)
+}
+
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] - b\[1\]
+/// res\[1\] = a\[1\] + b\[0\]
+#[inline]
+#[cfg_attr(test, assert_instr(qasx))]
+pub unsafe fn __qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_qasx, a, b)
+}
+
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] + b\[1\]
+/// res\[1\] = a\[1\] - b\[0\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsax))]
+pub unsafe fn __qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_qsax, a, b)
+}
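+
+// Editorial sketch, not part of this patch: the "exchange" pattern shared by
+// __qasx and __qsax, modelled on plain halfwords with saturating arithmetic.
+fn qasx_model(a: [i16; 2], b: [i16; 2]) -> [i16; 2] {
+    // low lane: a[0] - b[1]; high lane: a[1] + b[0]
+    [a[0].saturating_sub(b[1]), a[1].saturating_add(b[0])]
+}
+
+fn qsax_model(a: [i16; 2], b: [i16; 2]) -> [i16; 2] {
+    // low lane: a[0] + b[1]; high lane: a[1] - b[0]
+    [a[0].saturating_add(b[1]), a[1].saturating_sub(b[0])]
+}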
+
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sadd16))]
+pub unsafe fn __sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_sadd16, a, b)
+}
+
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+/// res\[2\] = a\[2\] + b\[2\]
+/// res\[3\] = a\[3\] + b\[3\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sadd8))]
+pub unsafe fn __sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_sadd8, a, b)
+}
+
+/// Dual 16-bit Signed Multiply with Addition of products
+/// and 32-bit accumulation.
+///
+/// Returns the 32-bit signed equivalent of
+/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\] + c
+#[inline]
+#[cfg_attr(test, assert_instr(smlad))]
+pub unsafe fn __smlad(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+ arm_smlad(transmute(a), transmute(b), c)
+}
+
+/// Dual 16-bit Signed Multiply with Subtraction of products
+/// and 32-bit accumulation and overflow detection.
+///
+/// Returns the 32-bit signed equivalent of
+/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\] + c
+#[inline]
+#[cfg_attr(test, assert_instr(smlsd))]
+pub unsafe fn __smlsd(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+ arm_smlsd(transmute(a), transmute(b), c)
+}
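+
+// Editorial sketch, not part of this patch: the widening multiply-accumulate
+// formulas documented for __smlad and __smlsd. The real instructions wrap on
+// the final accumulation and set the Q flag; that flag is not modelled here.
+fn smlad_model(a: [i16; 2], b: [i16; 2], acc: i32) -> i32 {
+    let p0 = a[0] as i32 * b[0] as i32; // a 16 x 16 product always fits in i32
+    let p1 = a[1] as i32 * b[1] as i32;
+    p0.wrapping_add(p1).wrapping_add(acc)
+}
+
+fn smlsd_model(a: [i16; 2], b: [i16; 2], acc: i32) -> i32 {
+    let p0 = a[0] as i32 * b[0] as i32;
+    let p1 = a[1] as i32 * b[1] as i32;
+    (p0 - p1).wrapping_add(acc) // the difference of two such products cannot overflow
+}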
+
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[1\]
+/// res\[1\] = a\[1\] + b\[0\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sasx))]
+pub unsafe fn __sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_sasx, a, b)
+}
+
+/// Select bytes from each operand according to APSR GE flags
+///
+/// Returns the equivalent of
+///
+/// res\[0\] = GE\[0\] ? a\[0\] : b\[0\]
+/// res\[1\] = GE\[1\] ? a\[1\] : b\[1\]
+/// res\[2\] = GE\[2\] ? a\[2\] : b\[2\]
+/// res\[3\] = GE\[3\] ? a\[3\] : b\[3\]
+///
+/// where GE are bits of APSR
+#[inline]
+#[cfg_attr(test, assert_instr(sel))]
+pub unsafe fn __sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_sel, a, b)
+}
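+
+// Editorial sketch, not part of this patch: the byte selection performed by
+// __sel, with the four APSR.GE flags made explicit as a parameter. On real
+// hardware the flags are produced implicitly by a preceding instruction such
+// as SADD8, as the `sel` test later in this file shows.
+fn sel_model(a: [i8; 4], b: [i8; 4], ge: [bool; 4]) -> [i8; 4] {
+    let mut r = [0i8; 4];
+    for i in 0..4 {
+        r[i] = if ge[i] { a[i] } else { b[i] };
+    }
+    r
+}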
+
+/// Signed halving parallel byte-wise addition.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] + b\[0\]) / 2
+/// res\[1\] = (a\[1\] + b\[1\]) / 2
+/// res\[2\] = (a\[2\] + b\[2\]) / 2
+/// res\[3\] = (a\[3\] + b\[3\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shadd8))]
+pub unsafe fn __shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_shadd8, a, b)
+}
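+
+// Editorial sketch, not part of this patch: the halving addition documented
+// above. Widening to i16 keeps the intermediate sum exact, and an arithmetic
+// shift right by one halves it, rounding towards negative infinity as the
+// instruction does.
+fn shadd8_model(a: [i8; 4], b: [i8; 4]) -> [i8; 4] {
+    let mut r = [0i8; 4];
+    for i in 0..4 {
+        r[i] = ((a[i] as i16 + b[i] as i16) >> 1) as i8;
+    }
+    r
+}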
+
+/// Signed halving parallel halfword-wise addition.
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] + b\[0\]) / 2
+/// res\[1\] = (a\[1\] + b\[1\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shadd16))]
+pub unsafe fn __shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_shadd16, a, b)
+}
+
+/// Signed halving parallel byte-wise subtraction.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] - b\[0\]) / 2
+/// res\[1\] = (a\[1\] - b\[1\]) / 2
+/// res\[2\] = (a\[2\] - b\[2\]) / 2
+/// res\[3\] = (a\[3\] - b\[3\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shsub8))]
+pub unsafe fn __shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_shsub8, a, b)
+}
+
+/// Inserts a `USUB8` instruction.
+///
+/// Returns the 8-bit unsigned equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+/// res\[2\] = a\[2\] - b\[2\]
+/// res\[3\] = a\[3\] - b\[3\]
+///
+/// where \[0\] is the lower 8 bits and \[3\] is the upper 8 bits.
+/// The GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(usub8))]
+pub unsafe fn __usub8(a: uint8x4_t, b: uint8x4_t) -> uint8x4_t {
+ dsp_call!(arm_usub8, a, b)
+}
+
+/// Inserts a `SSUB8` instruction.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+/// res\[2\] = a\[2\] - b\[2\]
+/// res\[3\] = a\[3\] - b\[3\]
+///
+/// where \[0\] is the lower 8 bits and \[3\] is the upper 8 bits.
+/// The GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(ssub8))]
+pub unsafe fn __ssub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+ dsp_call!(arm_ssub8, a, b)
+}
+
+/// Signed halving parallel halfword-wise subtraction.
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] - b\[0\]) / 2
+/// res\[1\] = (a\[1\] - b\[1\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shsub16))]
+pub unsafe fn __shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+ dsp_call!(arm_shsub16, a, b)
+}
+
+/// Signed Dual Multiply Add.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smuad))]
+pub unsafe fn __smuad(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smuad(transmute(a), transmute(b))
+}
+
+/// Signed Dual Multiply Add Reversed.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[1\] + a\[1\] * b\[0\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smuadx))]
+pub unsafe fn __smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smuadx(transmute(a), transmute(b))
+}
+
+/// Signed Dual Multiply Subtract.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\]
+///
+/// The difference of the two products always fits in 32 bits, so overflow
+/// cannot occur and the Q flag is unaffected.
+#[inline]
+#[cfg_attr(test, assert_instr(smusd))]
+pub unsafe fn __smusd(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smusd(transmute(a), transmute(b))
+}
+
+/// Signed Dual Multiply Subtract Reversed.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[1\] - a\[1\] * b\[0\]
+///
+/// The difference of the two products always fits in 32 bits, so overflow
+/// cannot occur and the Q flag is unaffected.
+#[inline]
+#[cfg_attr(test, assert_instr(smusdx))]
+pub unsafe fn __smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
+ arm_smusdx(transmute(a), transmute(b))
+}
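+
+// Editorial sketch, not part of this patch: the four dual-multiply variants
+// above on plain halfwords. Only the "add" variants can overflow; the real
+// SMUAD/SMUADX then set the Q flag, which is not modelled here.
+fn smuad_model(a: [i16; 2], b: [i16; 2]) -> i32 {
+    (a[0] as i32 * b[0] as i32).wrapping_add(a[1] as i32 * b[1] as i32)
+}
+
+fn smuadx_model(a: [i16; 2], b: [i16; 2]) -> i32 {
+    (a[0] as i32 * b[1] as i32).wrapping_add(a[1] as i32 * b[0] as i32)
+}
+
+fn smusd_model(a: [i16; 2], b: [i16; 2]) -> i32 {
+    a[0] as i32 * b[0] as i32 - a[1] as i32 * b[1] as i32
+}
+
+fn smusdx_model(a: [i16; 2], b: [i16; 2]) -> i32 {
+    a[0] as i32 * b[1] as i32 - a[1] as i32 * b[0] as i32
+}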
+
+/// Sum of 8-bit absolute differences.
+///
+/// Returns the 32-bit unsigned equivalent of
+///
+/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
+/// abs(a\[2\] - b\[2\]) + abs(a\[3\] - b\[3\])
+#[inline]
+#[cfg_attr(test, assert_instr(usad8))]
+pub unsafe fn __usad8(a: int8x4_t, b: int8x4_t) -> u32 {
+ arm_usad8(transmute(a), transmute(b))
+}
+
+/// Sum of 8-bit absolute differences and constant.
+///
+/// Returns the 32-bit unsigned equivalent of
+///
+/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
+/// abs(a\[2\] - b\[2\]) + abs(a\[3\] - b\[3\]) + c
+#[inline]
+#[cfg_attr(test, assert_instr(usad8))]
+pub unsafe fn __usada8(a: int8x4_t, b: int8x4_t, c: u32) -> u32 {
+ __usad8(a, b) + c
+}
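+
+// Editorial sketch, not part of this patch: the sum of absolute byte
+// differences described above, treating each lane as unsigned.
+fn usad8_model(a: [u8; 4], b: [u8; 4]) -> u32 {
+    let mut acc = 0u32;
+    for i in 0..4 {
+        acc += (a[i] as i32 - b[i] as i32).unsigned_abs();
+    }
+    acc
+}
+
+fn usada8_model(a: [u8; 4], b: [u8; 4], c: u32) -> u32 {
+    usad8_model(a, b).wrapping_add(c) // the accumulating form just adds the constant
+}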
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::simd::{i16x2, i8x4, u8x4};
+ use std::mem::transmute;
+ use stdarch_test::simd_test;
+
+ #[test]
+ fn qadd8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, i8::MAX);
+ let b = i8x4::new(2, -1, 0, 1);
+ let c = i8x4::new(3, 1, 3, i8::MAX);
+ let r: i8x4 = dsp_call!(super::__qadd8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn qsub8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, i8::MIN);
+ let b = i8x4::new(2, -1, 0, 1);
+ let c = i8x4::new(-1, 3, 3, i8::MIN);
+ let r: i8x4 = dsp_call!(super::__qsub8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn qadd16() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(2, -1);
+ let c = i16x2::new(3, 1);
+ let r: i16x2 = dsp_call!(super::__qadd16, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn qsub16() {
+ unsafe {
+ let a = i16x2::new(10, 20);
+ let b = i16x2::new(20, -10);
+ let c = i16x2::new(-10, 30);
+ let r: i16x2 = dsp_call!(super::__qsub16, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn qasx() {
+ unsafe {
+ let a = i16x2::new(1, i16::MAX);
+ let b = i16x2::new(2, 2);
+ let c = i16x2::new(-1, i16::MAX);
+ let r: i16x2 = dsp_call!(super::__qasx, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn qsax() {
+ unsafe {
+ let a = i16x2::new(1, i16::MAX);
+ let b = i16x2::new(2, 2);
+ let c = i16x2::new(3, i16::MAX - 2);
+ let r: i16x2 = dsp_call!(super::__qsax, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn sadd16() {
+ unsafe {
+ let a = i16x2::new(1, i16::MAX);
+ let b = i16x2::new(2, 2);
+ let c = i16x2::new(3, -i16::MAX);
+ let r: i16x2 = dsp_call!(super::__sadd16, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn sadd8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, i8::MAX);
+ let b = i8x4::new(4, 3, 2, 2);
+ let c = i8x4::new(5, 5, 5, -i8::MAX);
+ let r: i8x4 = dsp_call!(super::__sadd8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn sasx() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(2, 1);
+ let c = i16x2::new(0, 4);
+ let r: i16x2 = dsp_call!(super::__sasx, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn smlad() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(3, 4);
+ let r = super::__smlad(transmute(a), transmute(b), 10);
+ assert_eq!(r, (1 * 3) + (2 * 4) + 10);
+ }
+ }
+
+ #[test]
+ fn smlsd() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(3, 4);
+ let r = super::__smlsd(transmute(a), transmute(b), 10);
+ assert_eq!(r, ((1 * 3) - (2 * 4)) + 10);
+ }
+ }
+
+ #[test]
+ fn sel() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, i8::MAX);
+ let b = i8x4::new(4, 3, 2, 2);
+ // call sadd8() to set GE bits
+ super::__sadd8(transmute(a), transmute(b));
+ let c = i8x4::new(1, 2, 3, i8::MAX);
+ let r: i8x4 = dsp_call!(super::__sel, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn shadd8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, 4);
+ let b = i8x4::new(5, 4, 3, 2);
+ let c = i8x4::new(3, 3, 3, 3);
+ let r: i8x4 = dsp_call!(super::__shadd8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn shadd16() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(5, 4);
+ let c = i16x2::new(3, 3);
+ let r: i16x2 = dsp_call!(super::__shadd16, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn shsub8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, 4);
+ let b = i8x4::new(5, 4, 3, 2);
+ let c = i8x4::new(-2, -1, 0, 1);
+ let r: i8x4 = dsp_call!(super::__shsub8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn ssub8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, 4);
+ let b = i8x4::new(5, 4, 3, 2);
+ let c = i8x4::new(-4, -2, 0, 2);
+ let r: i8x4 = dsp_call!(super::__ssub8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn usub8() {
+ unsafe {
+ let a = u8x4::new(1, 2, 3, 4);
+ let b = u8x4::new(5, 4, 3, 2);
+ let c = u8x4::new(252, 254, 0, 2);
+ let r: u8x4 = dsp_call!(super::__usub8, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn shsub16() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(5, 4);
+ let c = i16x2::new(-2, -1);
+ let r: i16x2 = dsp_call!(super::__shsub16, a, b);
+ assert_eq!(r, c);
+ }
+ }
+
+ #[test]
+ fn smuad() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(5, 4);
+ let r = super::__smuad(transmute(a), transmute(b));
+ assert_eq!(r, 13);
+ }
+ }
+
+ #[test]
+ fn smuadx() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(5, 4);
+ let r = super::__smuadx(transmute(a), transmute(b));
+ assert_eq!(r, 14);
+ }
+ }
+
+ #[test]
+ fn smusd() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(5, 4);
+ let r = super::__smusd(transmute(a), transmute(b));
+ assert_eq!(r, -3);
+ }
+ }
+
+ #[test]
+ fn smusdx() {
+ unsafe {
+ let a = i16x2::new(1, 2);
+ let b = i16x2::new(5, 4);
+ let r = super::__smusdx(transmute(a), transmute(b));
+ assert_eq!(r, -6);
+ }
+ }
+
+ #[test]
+ fn usad8() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, 4);
+ let b = i8x4::new(4, 3, 2, 1);
+ let r = super::__usad8(transmute(a), transmute(b));
+ assert_eq!(r, 8);
+ }
+ }
+
+ #[test]
+ fn usad8a() {
+ unsafe {
+ let a = i8x4::new(1, 2, 3, 4);
+ let b = i8x4::new(4, 3, 2, 1);
+ let c = 10;
+ let r = super::__usada8(transmute(a), transmute(b), c);
+ assert_eq!(r, 8 + c);
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/v6.rs b/library/stdarch/crates/core_arch/src/arm/v6.rs
new file mode 100644
index 000000000..5df30cd62
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/v6.rs
@@ -0,0 +1,49 @@
+//! ARMv6 intrinsics.
+//!
+//! The reference is [ARMv6-M Architecture Reference Manual][armv6m].
+//!
+//! [armv6m]:
+//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index.html
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Reverse the order of the bytes.
+#[inline]
+#[cfg_attr(test, assert_instr(rev))]
+pub unsafe fn _rev_u16(x: u16) -> u16 {
+ x.swap_bytes()
+}
+
+/// Reverse the order of the bytes.
+#[inline]
+#[cfg_attr(test, assert_instr(rev))]
+pub unsafe fn _rev_u32(x: u32) -> u32 {
+ x.swap_bytes()
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::arm::v6;
+
+ #[test]
+ fn _rev_u16() {
+ unsafe {
+ assert_eq!(
+ v6::_rev_u16(0b0000_0000_1111_1111_u16),
+ 0b1111_1111_0000_0000_u16
+ );
+ }
+ }
+
+ #[test]
+ fn _rev_u32() {
+ unsafe {
+ assert_eq!(
+ v6::_rev_u32(0b0000_0000_1111_1111_0000_0000_1111_1111_u32),
+ 0b1111_1111_0000_0000_1111_1111_0000_0000_u32
+ );
+ }
+ }
+}
diff --git a/library/stdarch/crates/core_arch/src/arm/v7.rs b/library/stdarch/crates/core_arch/src/arm/v7.rs
new file mode 100644
index 000000000..e7507f9b9
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/arm/v7.rs
@@ -0,0 +1,88 @@
+//! ARMv7 intrinsics.
+//!
+//! The reference is [ARMv7-M Architecture Reference Manual (Issue
+//! E.b)][armv7m].
+//!
+//! [armv7m]:
+//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e.b/index.html
+
+pub use super::v6::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Count Leading Zeros.
+#[inline]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+// FIXME: https://github.com/rust-lang/stdarch/issues/382
+// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))]
+pub unsafe fn _clz_u8(x: u8) -> u8 {
+ x.leading_zeros() as u8
+}
+
+/// Count Leading Zeros.
+#[inline]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+// FIXME: https://github.com/rust-lang/stdarch/issues/382
+// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))]
+pub unsafe fn _clz_u16(x: u16) -> u16 {
+ x.leading_zeros() as u16
+}
+
+/// Count Leading Zeros.
+#[inline]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))]
+// FIXME: https://github.com/rust-lang/stdarch/issues/382
+// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))]
+pub unsafe fn _clz_u32(x: u32) -> u32 {
+ x.leading_zeros() as u32
+}
+
+/// Reverse the bit order.
+#[inline]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(test, assert_instr(rbit))]
+pub unsafe fn _rbit_u32(x: u32) -> u32 {
+ crate::intrinsics::bitreverse(x)
+}
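+
+// Editorial sketch, not part of this patch: stable standard-library
+// counterparts of the CLZ and RBIT behaviour wrapped above, shown side by
+// side so the expected results can be checked on any target.
+fn clz_rbit_demo(x: u32) -> (u32, u32) {
+    (x.leading_zeros(), x.reverse_bits()) // e.g. 0b1010 -> (28, 0x5000_0000)
+}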
+
+#[cfg(test)]
+mod tests {
+ use crate::core_arch::arm::v7;
+
+ #[test]
+ fn _clz_u8() {
+ unsafe {
+ assert_eq!(v7::_clz_u8(0b0000_1010u8), 4u8);
+ }
+ }
+
+ #[test]
+ fn _clz_u16() {
+ unsafe {
+ assert_eq!(v7::_clz_u16(0b0000_1010u16), 12u16);
+ }
+ }
+
+ #[test]
+ fn _clz_u32() {
+ unsafe {
+ assert_eq!(v7::_clz_u32(0b0000_1010u32), 28u32);
+ }
+ }
+
+ #[test]
+ #[cfg(dont_compile_me)] // FIXME need to add `v7` upstream in rustc
+ fn _rbit_u32() {
+ unsafe {
+ assert_eq!(
+ v7::_rbit_u32(0b0000_1010u32),
+ 0b0101_0000_0000_0000_0000_0000_0000_0000u32
+ );
+ }
+ }
+}