From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- .../stdarch/crates/core_arch/src/arm/armclang.rs | 35 + library/stdarch/crates/core_arch/src/arm/dsp.rs | 384 ++++++ library/stdarch/crates/core_arch/src/arm/ex.rs | 125 ++ library/stdarch/crates/core_arch/src/arm/mod.rs | 113 ++ library/stdarch/crates/core_arch/src/arm/neon.rs | 1369 ++++++++++++++++++++ library/stdarch/crates/core_arch/src/arm/sat.rs | 8 + library/stdarch/crates/core_arch/src/arm/simd32.rs | 728 +++++++++++ library/stdarch/crates/core_arch/src/arm/v6.rs | 49 + library/stdarch/crates/core_arch/src/arm/v7.rs | 88 ++ 9 files changed, 2899 insertions(+) create mode 100644 library/stdarch/crates/core_arch/src/arm/armclang.rs create mode 100644 library/stdarch/crates/core_arch/src/arm/dsp.rs create mode 100644 library/stdarch/crates/core_arch/src/arm/ex.rs create mode 100644 library/stdarch/crates/core_arch/src/arm/mod.rs create mode 100644 library/stdarch/crates/core_arch/src/arm/neon.rs create mode 100644 library/stdarch/crates/core_arch/src/arm/sat.rs create mode 100644 library/stdarch/crates/core_arch/src/arm/simd32.rs create mode 100644 library/stdarch/crates/core_arch/src/arm/v6.rs create mode 100644 library/stdarch/crates/core_arch/src/arm/v7.rs (limited to 'library/stdarch/crates/core_arch/src/arm') diff --git a/library/stdarch/crates/core_arch/src/arm/armclang.rs b/library/stdarch/crates/core_arch/src/arm/armclang.rs new file mode 100644 index 000000000..e68c02d02 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/armclang.rs @@ -0,0 +1,35 @@ +//! ARM compiler specific intrinsics +//! +//! # References +//! +//! - [ARM Compiler v 6.10 - armclang Reference Guide][arm_comp_ref] +//! +//! [arm_comp_ref]: https://developer.arm.com/docs/100067/0610 + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Inserts a breakpoint instruction. +/// +/// `VAL` is a compile-time constant integer in range `[0, 255]`. +/// +/// The breakpoint instruction inserted is `BKPT` on A32/T32. +/// +/// # Note +/// +/// [ARM's documentation][arm_docs] defines that `__breakpoint` accepts the +/// following values for `VAL`: +/// +/// - `0...65535` when compiling as A32, +/// - `0...255` when compiling as T32. +/// +/// The current implementation only accepts values in range `[0, 255]`. +/// +/// [arm_docs]: https://developer.arm.com/docs/100067/latest/compiler-specific-intrinsics/__breakpoint-intrinsic +#[cfg_attr(test, assert_instr(bkpt, VAL = 0))] +#[inline(always)] +#[rustc_legacy_const_generics(0)] +pub unsafe fn __breakpoint() { + static_assert_imm8!(VAL); + crate::arch::asm!("bkpt #{}", const VAL); +} diff --git a/library/stdarch/crates/core_arch/src/arm/dsp.rs b/library/stdarch/crates/core_arch/src/arm/dsp.rs new file mode 100644 index 000000000..6720f97a5 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/dsp.rs @@ -0,0 +1,384 @@ +//! # References: +//! +//! - Section 8.3 "16-bit multiplications" +//! +//! Intrinsics that could live here: +//! +//! - \[x\] __smulbb +//! - \[x\] __smulbt +//! - \[x\] __smultb +//! - \[x\] __smultt +//! - \[x\] __smulwb +//! - \[x\] __smulwt +//! - \[x\] __qadd +//! - \[x\] __qsub +//! - \[x\] __qdbl +//! - \[x\] __smlabb +//! - \[x\] __smlabt +//! - \[x\] __smlatb +//! - \[x\] __smlatt +//! - \[x\] __smlawb +//! - \[x\] __smlawt + +#[cfg(test)] +use stdarch_test::assert_instr; + +use crate::mem::transmute; + +types! 
{ + /// ARM-specific 32-bit wide vector of two packed `i16`. + pub struct int16x2_t(i16, i16); + /// ARM-specific 32-bit wide vector of two packed `u16`. + pub struct uint16x2_t(u16, u16); +} + +extern "unadjusted" { + #[link_name = "llvm.arm.smulbb"] + fn arm_smulbb(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smulbt"] + fn arm_smulbt(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smultb"] + fn arm_smultb(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smultt"] + fn arm_smultt(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smulwb"] + fn arm_smulwb(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smulwt"] + fn arm_smulwt(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.qadd"] + fn arm_qadd(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.qsub"] + fn arm_qsub(a: i32, b: i32) -> i32; + + #[link_name = "llvm.arm.smlabb"] + fn arm_smlabb(a: i32, b: i32, c: i32) -> i32; + + #[link_name = "llvm.arm.smlabt"] + fn arm_smlabt(a: i32, b: i32, c: i32) -> i32; + + #[link_name = "llvm.arm.smlatb"] + fn arm_smlatb(a: i32, b: i32, c: i32) -> i32; + + #[link_name = "llvm.arm.smlatt"] + fn arm_smlatt(a: i32, b: i32, c: i32) -> i32; + + #[link_name = "llvm.arm.smlawb"] + fn arm_smlawb(a: i32, b: i32, c: i32) -> i32; + + #[link_name = "llvm.arm.smlawt"] + fn arm_smlawt(a: i32, b: i32, c: i32) -> i32; +} + +/// Insert a SMULBB instruction +/// +/// Returns the equivalent of a\[0\] * b\[0\] +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +#[inline] +#[cfg_attr(test, assert_instr(smulbb))] +pub unsafe fn __smulbb(a: int16x2_t, b: int16x2_t) -> i32 { + arm_smulbb(transmute(a), transmute(b)) +} + +/// Insert a SMULTB instruction +/// +/// Returns the equivalent of a\[0\] * b\[1\] +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +#[inline] +#[cfg_attr(test, assert_instr(smultb))] +pub unsafe fn __smultb(a: int16x2_t, b: int16x2_t) -> i32 { + arm_smultb(transmute(a), transmute(b)) +} + +/// Insert a SMULTB instruction +/// +/// Returns the equivalent of a\[1\] * b\[0\] +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +#[inline] +#[cfg_attr(test, assert_instr(smulbt))] +pub unsafe fn __smulbt(a: int16x2_t, b: int16x2_t) -> i32 { + arm_smulbt(transmute(a), transmute(b)) +} + +/// Insert a SMULTT instruction +/// +/// Returns the equivalent of a\[1\] * b\[1\] +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +#[inline] +#[cfg_attr(test, assert_instr(smultt))] +pub unsafe fn __smultt(a: int16x2_t, b: int16x2_t) -> i32 { + arm_smultt(transmute(a), transmute(b)) +} + +/// Insert a SMULWB instruction +/// +/// Multiplies the 32-bit signed first operand with the low halfword +/// (as a 16-bit signed integer) of the second operand. +/// Return the top 32 bits of the 48-bit product +#[inline] +#[cfg_attr(test, assert_instr(smulwb))] +pub unsafe fn __smulwb(a: int16x2_t, b: i32) -> i32 { + arm_smulwb(transmute(a), b) +} + +/// Insert a SMULWT instruction +/// +/// Multiplies the 32-bit signed first operand with the high halfword +/// (as a 16-bit signed integer) of the second operand. +/// Return the top 32 bits of the 48-bit product +#[inline] +#[cfg_attr(test, assert_instr(smulwt))] +pub unsafe fn __smulwt(a: int16x2_t, b: i32) -> i32 { + arm_smulwt(transmute(a), b) +} + +/// Signed saturating addition +/// +/// Returns the 32-bit saturating signed equivalent of a + b. +/// Sets the Q flag if saturation occurs. 
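+/// # Examples
+///
+/// A minimal sketch of the saturating behaviour, mirroring the unit tests at
+/// the bottom of this file (assumes an ARMv7 target with the DSP extension;
+/// marked `ignore` because it only builds there):
+///
+/// ```ignore
+/// unsafe {
+///     assert_eq!(__qadd(-10, 60), 50);            // in range: plain addition
+///     assert_eq!(__qadd(i32::MAX, 1), i32::MAX);  // clamps instead of wrapping
+///     assert_eq!(__qadd(i32::MIN, -1), i32::MIN); // clamps at the negative end
+/// }
+/// ```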
+#[inline] +#[cfg_attr(test, assert_instr(qadd))] +pub unsafe fn __qadd(a: i32, b: i32) -> i32 { + arm_qadd(a, b) +} + +/// Signed saturating subtraction +/// +/// Returns the 32-bit saturating signed equivalent of a - b. +/// Sets the Q flag if saturation occurs. +#[inline] +#[cfg_attr(test, assert_instr(qsub))] +pub unsafe fn __qsub(a: i32, b: i32) -> i32 { + arm_qsub(a, b) +} + +/// Insert a QADD instruction +/// +/// Returns the 32-bit saturating signed equivalent of a + a +/// Sets the Q flag if saturation occurs. +#[inline] +#[cfg_attr(test, assert_instr(qadd))] +pub unsafe fn __qdbl(a: i32) -> i32 { + arm_qadd(a, a) +} + +/// Insert a SMLABB instruction +/// +/// Returns the equivalent of a\[0\] * b\[0\] + c +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +/// Sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smlabb))] +pub unsafe fn __smlabb(a: int16x2_t, b: int16x2_t, c: i32) -> i32 { + arm_smlabb(transmute(a), transmute(b), c) +} + +/// Insert a SMLABT instruction +/// +/// Returns the equivalent of a\[0\] * b\[1\] + c +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +/// Sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smlabt))] +pub unsafe fn __smlabt(a: int16x2_t, b: int16x2_t, c: i32) -> i32 { + arm_smlabt(transmute(a), transmute(b), c) +} + +/// Insert a SMLATB instruction +/// +/// Returns the equivalent of a\[1\] * b\[0\] + c +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +/// Sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smlatb))] +pub unsafe fn __smlatb(a: int16x2_t, b: int16x2_t, c: i32) -> i32 { + arm_smlatb(transmute(a), transmute(b), c) +} + +/// Insert a SMLATT instruction +/// +/// Returns the equivalent of a\[1\] * b\[1\] + c +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +/// Sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smlatt))] +pub unsafe fn __smlatt(a: int16x2_t, b: int16x2_t, c: i32) -> i32 { + arm_smlatt(transmute(a), transmute(b), c) +} + +/// Insert a SMLAWB instruction +/// +/// Returns the equivalent of (a * b\[0\] + (c << 16)) >> 16 +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +/// Sets the Q flag if overflow occurs on the addition. +#[inline] +#[cfg_attr(test, assert_instr(smlawb))] +pub unsafe fn __smlawb(a: i32, b: int16x2_t, c: i32) -> i32 { + arm_smlawb(a, transmute(b), c) +} + +/// Insert a SMLAWT instruction +/// +/// Returns the equivalent of (a * b\[1\] + (c << 16)) >> 16 +/// where \[0\] is the lower 16 bits and \[1\] is the upper 16 bits. +/// Sets the Q flag if overflow occurs on the addition. 
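+/// # Examples
+///
+/// A sketch of the arithmetic, following the unit test further down in this
+/// file; building `int16x2_t` through `transmute` and the little-endian lane
+/// layout shown here are assumptions for illustration only:
+///
+/// ```ignore
+/// unsafe {
+///     let a: i32 = 10;
+///     // lane 1 (the high halfword) is 40
+///     let b: int16x2_t = core::mem::transmute([30i16, 40i16]);
+///     let c: i32 = 50;
+///     let expected = ((a * 40) + (c << 16)) >> 16;
+///     assert_eq!(__smlawt(a, b, c), expected);
+/// }
+/// ```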
+#[inline] +#[cfg_attr(test, assert_instr(smlawt))] +pub unsafe fn __smlawt(a: i32, b: int16x2_t, c: i32) -> i32 { + arm_smlawt(a, transmute(b), c) +} + +#[cfg(test)] +mod tests { + use crate::core_arch::{ + arm::*, + simd::{i16x2, i8x4, u8x4}, + }; + use std::mem::transmute; + use stdarch_test::simd_test; + + #[test] + fn smulbb() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + assert_eq!(super::__smulbb(transmute(a), transmute(b)), 10 * 30); + } + } + + #[test] + fn smulbt() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + assert_eq!(super::__smulbt(transmute(a), transmute(b)), 10 * 40); + } + } + + #[test] + fn smultb() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + assert_eq!(super::__smultb(transmute(a), transmute(b)), 20 * 30); + } + } + + #[test] + fn smultt() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + assert_eq!(super::__smultt(transmute(a), transmute(b)), 20 * 40); + } + } + + #[test] + fn smulwb() { + unsafe { + let a = i16x2::new(10, 20); + let b = 30; + assert_eq!(super::__smulwb(transmute(a), b), 20 * b); + } + } + + #[test] + fn smulwt() { + unsafe { + let a = i16x2::new(10, 20); + let b = 30; + assert_eq!(super::__smulwt(transmute(a), b), (10 * b) >> 16); + } + } + + #[test] + fn qadd() { + unsafe { + assert_eq!(super::__qadd(-10, 60), 50); + assert_eq!(super::__qadd(i32::MAX, 10), i32::MAX); + assert_eq!(super::__qadd(i32::MIN, -10), i32::MIN); + } + } + + #[test] + fn qsub() { + unsafe { + assert_eq!(super::__qsub(10, 60), -50); + assert_eq!(super::__qsub(i32::MAX, -10), i32::MAX); + assert_eq!(super::__qsub(i32::MIN, 10), i32::MIN); + } + } + + fn qdbl() { + unsafe { + assert_eq!(super::__qdbl(10), 20); + assert_eq!(super::__qdbl(i32::MAX), i32::MAX); + } + } + + fn smlabb() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + let c = 50; + let r = (10 * 30) + c; + assert_eq!(super::__smlabb(transmute(a), transmute(b), c), r); + } + } + + fn smlabt() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + let c = 50; + let r = (10 * 40) + c; + assert_eq!(super::__smlabt(transmute(a), transmute(b), c), r); + } + } + + fn smlatb() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + let c = 50; + let r = (20 * 30) + c; + assert_eq!(super::__smlabt(transmute(a), transmute(b), c), r); + } + } + + fn smlatt() { + unsafe { + let a = i16x2::new(10, 20); + let b = i16x2::new(30, 40); + let c = 50; + let r = (20 * 40) + c; + assert_eq!(super::__smlatt(transmute(a), transmute(b), c), r); + } + } + + fn smlawb() { + unsafe { + let a: i32 = 10; + let b = i16x2::new(30, 40); + let c: i32 = 50; + let r: i32 = ((a * 30) + (c << 16)) >> 16; + assert_eq!(super::__smlawb(a, transmute(b), c), r); + } + } + + fn smlawt() { + unsafe { + let a: i32 = 10; + let b = i16x2::new(30, 40); + let c: i32 = 50; + let r: i32 = ((a * 40) + (c << 16)) >> 16; + assert_eq!(super::__smlawt(a, transmute(b), c), r); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/arm/ex.rs b/library/stdarch/crates/core_arch/src/arm/ex.rs new file mode 100644 index 000000000..75f378642 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/ex.rs @@ -0,0 +1,125 @@ +// Reference: Section 5.4.4 "LDREX / STREX" of ACLE + +/// Removes the exclusive lock created by LDREX +// Supported: v6, v6K, v7-M, v7-A, v7-R +// Not supported: v5, v6-M +// NOTE: there's no dedicated CLREX instruction in v6 ( u8 { + extern "unadjusted" { + #[link_name = 
"llvm.arm.ldrex.p0i8"] + fn ldrex8(p: *const u8) -> u32; + } + + ldrex8(p) as u8 +} + +/// Executes an exclusive LDR instruction for 16 bit value. +// Supported: v6K, v7-M, v7-A, v7-R, v8 +// Not supported: v5, v6, v6-M +#[cfg(any( + target_feature = "v6k", // includes v7-M but excludes v6-M + doc +))] +pub unsafe fn __ldrexh(p: *const u16) -> u16 { + extern "unadjusted" { + #[link_name = "llvm.arm.ldrex.p0i16"] + fn ldrex16(p: *const u16) -> u32; + } + + ldrex16(p) as u16 +} + +/// Executes an exclusive LDR instruction for 32 bit value. +// Supported: v6, v7-M, v6K, v7-A, v7-R, v8 +// Not supported: v5, v6-M +#[cfg(any( + all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M + all(target_feature = "v7", target_feature = "mclass"), // v7-M + doc +))] +pub unsafe fn __ldrex(p: *const u32) -> u32 { + extern "unadjusted" { + #[link_name = "llvm.arm.ldrex.p0i32"] + fn ldrex32(p: *const u32) -> u32; + } + + ldrex32(p) +} + +/// Executes an exclusive STR instruction for 8 bit values +/// +/// Returns `0` if the operation succeeded, or `1` if it failed +// supported: v6K, v7-M, v7-A, v7-R +// Not supported: v5, v6, v6-M +#[cfg(any( + target_feature = "v6k", // includes v7-M but excludes v6-M + doc +))] +pub unsafe fn __strexb(value: u32, addr: *mut u8) -> u32 { + extern "unadjusted" { + #[link_name = "llvm.arm.strex.p0i8"] + fn strex8(value: u32, addr: *mut u8) -> u32; + } + + strex8(value, addr) +} + +/// Executes an exclusive STR instruction for 16 bit values +/// +/// Returns `0` if the operation succeeded, or `1` if it failed +// Supported: v6K, v7-M, v7-A, v7-R, v8 +// Not supported: v5, v6, v6-M +#[cfg(target_feature = "aarch64")] +#[cfg(any( + target_feature = "v6k", // includes v7-M but excludes v6-M + doc +))] +pub unsafe fn __strexh(value: u16, addr: *mut u16) -> u32 { + extern "unadjusted" { + #[link_name = "llvm.arm.strex.p0i16"] + fn strex16(value: u32, addr: *mut u16) -> u32; + } + + strex16(value as u32, addr) +} + +/// Executes an exclusive STR instruction for 32 bit values +/// +/// Returns `0` if the operation succeeded, or `1` if it failed +// Supported: v6, v7-M, v6K, v7-A, v7-R, v8 +// Not supported: v5, v6-M +#[cfg(any( + all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M + all(target_feature = "v7", target_feature = "mclass"), // v7-M + doc +))] +pub unsafe fn __strex(value: u32, addr: *mut u32) -> u32 { + extern "unadjusted" { + #[link_name = "llvm.arm.strex.p0i32"] + fn strex32(value: u32, addr: *mut u32) -> u32; + } + + strex32(value, addr) +} diff --git a/library/stdarch/crates/core_arch/src/arm/mod.rs b/library/stdarch/crates/core_arch/src/arm/mod.rs new file mode 100644 index 000000000..efe0068d4 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/mod.rs @@ -0,0 +1,113 @@ +//! ARM intrinsics. +//! +//! The reference for NEON is [ARM's NEON Intrinsics Reference][arm_ref]. The +//! [ARM's NEON Intrinsics Online Database][arm_dat] is also useful. +//! +//! [arm_ref]: http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf +//! [arm_dat]: https://developer.arm.com/technologies/neon/intrinsics + +mod armclang; +pub use self::armclang::*; + +mod v6; +pub use self::v6::*; + +// Supported arches: 6, 7-M. See Section 10.1 of ACLE (e.g. SSAT) +#[cfg(any(target_feature = "v6", doc))] +mod sat; + +#[cfg(any(target_feature = "v6", doc))] +pub use self::sat::*; + +// Supported arches: 5TE, 7E-M. See Section 10.1 of ACLE (e.g. 
QADD) +// We also include the A profile even though DSP is deprecated on that profile as of ACLE 2.0 (see +// section 5.4.7) +// Here we workaround the difference between LLVM's +dsp and ACLE's __ARM_FEATURE_DSP by gating on +// '+v5te' rather than on '+dsp' +#[cfg(any( + // >= v5TE but excludes v7-M + all(target_feature = "v5te", not(target_feature = "mclass")), + // v7E-M + all(target_feature = "mclass", target_feature = "dsp"), + doc, +))] +pub mod dsp; + +#[cfg(any( + // >= v5TE but excludes v7-M + all(target_feature = "v5te", not(target_feature = "mclass")), + // v7E-M + all(target_feature = "mclass", target_feature = "dsp"), + doc, +))] +pub use self::dsp::*; + +// Deprecated in ACLE 2.0 for the A profile but fully supported on the M and R profiles, says +// Section 5.4.9 of ACLE. We'll expose these for the A profile even if deprecated +#[cfg(any( + // v7-A, v7-R + all(target_feature = "v6", not(target_feature = "mclass")), + // v7E-M + all(target_feature = "mclass", target_feature = "dsp"), + doc, +))] +mod simd32; + +#[cfg(any( + // v7-A, v7-R + all(target_feature = "v6", not(target_feature = "mclass")), + // v7E-M + all(target_feature = "mclass", target_feature = "dsp"), + doc, +))] +pub use self::simd32::*; + +#[cfg(any(target_feature = "v7", doc))] +mod v7; +#[cfg(any(target_feature = "v7", doc))] +pub use self::v7::*; + +mod ex; +pub use self::ex::*; + +pub use crate::core_arch::arm_shared::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[cfg(any(target_feature = "v7", doc))] +pub(crate) mod neon; +#[cfg(any(target_feature = "v7", doc))] +pub use neon::*; + +/// Generates the trap instruction `UDF` +#[cfg(target_arch = "arm")] +#[cfg_attr(test, assert_instr(udf))] +#[inline] +pub unsafe fn udf() -> ! { + crate::intrinsics::abort() +} + +/// Generates a DBG instruction. +/// +/// This provides a hint to debugging and related systems. The argument must be +/// a constant integer from 0 to 15 inclusive. See implementation documentation +/// for the effect (if any) of this instruction and the meaning of the +/// argument. This is available only when compiling for AArch32. +// Section 10.1 of ACLE says that the supported arches are: 7, 7-M +// "The DBG hint instruction is added in ARMv7. It is UNDEFINED in the ARMv6 base architecture, and +// executes as a NOP instruction in ARMv6K and ARMv6T2." 
- ARM Architecture Reference Manual ARMv7-A +// and ARMv7-R edition (ARM DDI 0406C.c) sections D12.4.1 "ARM instruction set support" and D12.4.2 +// "Thumb instruction set support" +#[cfg(any(target_feature = "v7", doc))] +#[inline(always)] +#[rustc_legacy_const_generics(0)] +pub unsafe fn __dbg() { + static_assert_imm4!(IMM4); + dbg(IMM4); +} + +extern "unadjusted" { + #[link_name = "llvm.arm.dbg"] + fn dbg(_: i32); +} diff --git a/library/stdarch/crates/core_arch/src/arm/neon.rs b/library/stdarch/crates/core_arch/src/arm/neon.rs new file mode 100644 index 000000000..a0ad92c33 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/neon.rs @@ -0,0 +1,1369 @@ +use crate::core_arch::arm_shared::neon::*; +use crate::core_arch::simd::{f32x4, i32x4, u32x4}; +use crate::core_arch::simd_llvm::*; +use crate::mem::{align_of, transmute}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +#[allow(non_camel_case_types)] +pub(crate) type p8 = u8; +#[allow(non_camel_case_types)] +pub(crate) type p16 = u16; + +#[allow(improper_ctypes)] +extern "unadjusted" { + #[link_name = "llvm.arm.neon.vbsl.v8i8"] + fn vbsl_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vbsl.v16i8"] + fn vbslq_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t; + #[link_name = "llvm.arm.neon.vpadals.v4i16.v8i8"] + pub(crate) fn vpadal_s8_(a: int16x4_t, b: int8x8_t) -> int16x4_t; + #[link_name = "llvm.arm.neon.vpadals.v2i32.v4i16"] + pub(crate) fn vpadal_s16_(a: int32x2_t, b: int16x4_t) -> int32x2_t; + #[link_name = "llvm.arm.neon.vpadals.v1i64.v2i32"] + pub(crate) fn vpadal_s32_(a: int64x1_t, b: int32x2_t) -> int64x1_t; + #[link_name = "llvm.arm.neon.vpadals.v8i16.v16i8"] + pub(crate) fn vpadalq_s8_(a: int16x8_t, b: int8x16_t) -> int16x8_t; + #[link_name = "llvm.arm.neon.vpadals.v4i32.v8i16"] + pub(crate) fn vpadalq_s16_(a: int32x4_t, b: int16x8_t) -> int32x4_t; + #[link_name = "llvm.arm.neon.vpadals.v2i64.v4i32"] + pub(crate) fn vpadalq_s32_(a: int64x2_t, b: int32x4_t) -> int64x2_t; + + #[link_name = "llvm.arm.neon.vpadalu.v4i16.v8i8"] + pub(crate) fn vpadal_u8_(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t; + #[link_name = "llvm.arm.neon.vpadalu.v2i32.v4i16"] + pub(crate) fn vpadal_u16_(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t; + #[link_name = "llvm.arm.neon.vpadalu.v1i64.v2i32"] + pub(crate) fn vpadal_u32_(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t; + #[link_name = "llvm.arm.neon.vpadalu.v8i16.v16i8"] + pub(crate) fn vpadalq_u8_(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t; + #[link_name = "llvm.arm.neon.vpadalu.v4i32.v8i16"] + pub(crate) fn vpadalq_u16_(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t; + #[link_name = "llvm.arm.neon.vpadalu.v2i64.v4i32"] + pub(crate) fn vpadalq_u32_(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t; + + #[link_name = "llvm.arm.neon.vtbl1"] + fn vtbl1(a: int8x8_t, b: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vtbl2"] + fn vtbl2(a: int8x8_t, b: int8x8_t, b: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vtbl3"] + fn vtbl3(a: int8x8_t, b: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vtbl4"] + fn vtbl4(a: int8x8_t, b: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) -> int8x8_t; + + #[link_name = "llvm.arm.neon.vtbx1"] + fn vtbx1(a: int8x8_t, b: int8x8_t, b: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vtbx2"] + fn vtbx2(a: int8x8_t, b: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vtbx3"] + fn vtbx3(a: int8x8_t, b: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t) 
-> int8x8_t; + #[link_name = "llvm.arm.neon.vtbx4"] + fn vtbx4( + a: int8x8_t, + b: int8x8_t, + b: int8x8_t, + c: int8x8_t, + d: int8x8_t, + e: int8x8_t, + ) -> int8x8_t; + + #[link_name = "llvm.arm.neon.vshiftins.v8i8"] + fn vshiftins_v8i8(a: int8x8_t, b: int8x8_t, shift: int8x8_t) -> int8x8_t; + #[link_name = "llvm.arm.neon.vshiftins.v16i8"] + fn vshiftins_v16i8(a: int8x16_t, b: int8x16_t, shift: int8x16_t) -> int8x16_t; + #[link_name = "llvm.arm.neon.vshiftins.v4i16"] + fn vshiftins_v4i16(a: int16x4_t, b: int16x4_t, shift: int16x4_t) -> int16x4_t; + #[link_name = "llvm.arm.neon.vshiftins.v8i16"] + fn vshiftins_v8i16(a: int16x8_t, b: int16x8_t, shift: int16x8_t) -> int16x8_t; + #[link_name = "llvm.arm.neon.vshiftins.v2i32"] + fn vshiftins_v2i32(a: int32x2_t, b: int32x2_t, shift: int32x2_t) -> int32x2_t; + #[link_name = "llvm.arm.neon.vshiftins.v4i32"] + fn vshiftins_v4i32(a: int32x4_t, b: int32x4_t, shift: int32x4_t) -> int32x4_t; + #[link_name = "llvm.arm.neon.vshiftins.v1i64"] + fn vshiftins_v1i64(a: int64x1_t, b: int64x1_t, shift: int64x1_t) -> int64x1_t; + #[link_name = "llvm.arm.neon.vshiftins.v2i64"] + fn vshiftins_v2i64(a: int64x2_t, b: int64x2_t, shift: int64x2_t) -> int64x2_t; + + #[link_name = "llvm.arm.neon.vld1.v8i8.p0i8"] + fn vld1_v8i8(addr: *const i8, align: i32) -> int8x8_t; + #[link_name = "llvm.arm.neon.vld1.v16i8.p0i8"] + fn vld1q_v16i8(addr: *const i8, align: i32) -> int8x16_t; + #[link_name = "llvm.arm.neon.vld1.v4i16.p0i8"] + fn vld1_v4i16(addr: *const i8, align: i32) -> int16x4_t; + #[link_name = "llvm.arm.neon.vld1.v8i16.p0i8"] + fn vld1q_v8i16(addr: *const i8, align: i32) -> int16x8_t; + #[link_name = "llvm.arm.neon.vld1.v2i32.p0i8"] + fn vld1_v2i32(addr: *const i8, align: i32) -> int32x2_t; + #[link_name = "llvm.arm.neon.vld1.v4i32.p0i8"] + fn vld1q_v4i32(addr: *const i8, align: i32) -> int32x4_t; + #[link_name = "llvm.arm.neon.vld1.v1i64.p0i8"] + fn vld1_v1i64(addr: *const i8, align: i32) -> int64x1_t; + #[link_name = "llvm.arm.neon.vld1.v2i64.p0i8"] + fn vld1q_v2i64(addr: *const i8, align: i32) -> int64x2_t; + #[link_name = "llvm.arm.neon.vld1.v2f32.p0i8"] + fn vld1_v2f32(addr: *const i8, align: i32) -> float32x2_t; + #[link_name = "llvm.arm.neon.vld1.v4f32.p0i8"] + fn vld1q_v4f32(addr: *const i8, align: i32) -> float32x4_t; + + #[link_name = "llvm.arm.neon.vst1.p0i8.v8i8"] + fn vst1_v8i8(addr: *const i8, val: int8x8_t, align: i32); + #[link_name = "llvm.arm.neon.vst1.p0i8.v16i8"] + fn vst1q_v16i8(addr: *const i8, val: int8x16_t, align: i32); + #[link_name = "llvm.arm.neon.vst1.p0i8.v4i16"] + fn vst1_v4i16(addr: *const i8, val: int16x4_t, align: i32); + #[link_name = "llvm.arm.neon.vst1.p0i8.v8i16"] + fn vst1q_v8i16(addr: *const i8, val: int16x8_t, align: i32); + #[link_name = "llvm.arm.neon.vst1.p0i8.v2i32"] + fn vst1_v2i32(addr: *const i8, val: int32x2_t, align: i32); + #[link_name = "llvm.arm.neon.vst1.p0i8.v4i32"] + fn vst1q_v4i32(addr: *const i8, val: int32x4_t, align: i32); + #[link_name = "llvm.arm.neon.vst1.p0i8.v1i64"] + fn vst1_v1i64(addr: *const i8, val: int64x1_t, align: i32); + #[link_name = "llvm.arm.neon.vst1.p0i8.v2i64"] + fn vst1q_v2i64(addr: *const i8, val: int64x2_t, align: i32); + #[link_name = "llvm.arm.neon.vst1.p0i8.v2f32"] + fn vst1_v2f32(addr: *const i8, val: float32x2_t, align: i32); + #[link_name = "llvm.arm.neon.vst1.p0i8.v4f32"] + fn vst1q_v4f32(addr: *const i8, val: float32x4_t, align: i32); +} + +/// Load multiple single-element structures to one, two, three, or four registers. 
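+/// # Examples
+///
+/// A round-trip sketch using the matching store defined below (assumes an
+/// ARMv7 NEON target; `ignore`d because it only builds there):
+///
+/// ```ignore
+/// let src: [i8; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+/// let mut dst = [0i8; 8];
+/// unsafe {
+///     let v = vld1_s8(src.as_ptr());  // load eight lanes from memory
+///     vst1_s8(dst.as_mut_ptr(), v);   // store them back out
+/// }
+/// assert_eq!(src, dst);
+/// ```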
+#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.8"))] +pub unsafe fn vld1_s8(ptr: *const i8) -> int8x8_t { + vld1_v8i8(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.8"))] +pub unsafe fn vld1q_s8(ptr: *const i8) -> int8x16_t { + vld1q_v16i8(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.16"))] +pub unsafe fn vld1_s16(ptr: *const i16) -> int16x4_t { + vld1_v4i16(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.16"))] +pub unsafe fn vld1q_s16(ptr: *const i16) -> int16x8_t { + vld1q_v8i16(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vldr))] +pub unsafe fn vld1_s32(ptr: *const i32) -> int32x2_t { + vld1_v2i32(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.32"))] +pub unsafe fn vld1q_s32(ptr: *const i32) -> int32x4_t { + vld1q_v4i32(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vldr))] +pub unsafe fn vld1_s64(ptr: *const i64) -> int64x1_t { + vld1_v1i64(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.64"))] +pub unsafe fn vld1q_s64(ptr: *const i64) -> int64x2_t { + vld1q_v2i64(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.8"))] +pub unsafe fn vld1_u8(ptr: *const u8) -> uint8x8_t { + transmute(vld1_v8i8(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.8"))] +pub unsafe fn vld1q_u8(ptr: *const u8) -> uint8x16_t { + transmute(vld1q_v16i8(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.16"))] +pub unsafe fn vld1_u16(ptr: *const u16) -> uint16x4_t { + transmute(vld1_v4i16(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.16"))] +pub unsafe fn vld1q_u16(ptr: *const u16) -> uint16x8_t { + transmute(vld1q_v8i16(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. 
+#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vldr))] +pub unsafe fn vld1_u32(ptr: *const u32) -> uint32x2_t { + transmute(vld1_v2i32(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.32"))] +pub unsafe fn vld1q_u32(ptr: *const u32) -> uint32x4_t { + transmute(vld1q_v4i32(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vldr))] +pub unsafe fn vld1_u64(ptr: *const u64) -> uint64x1_t { + transmute(vld1_v1i64(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.64"))] +pub unsafe fn vld1q_u64(ptr: *const u64) -> uint64x2_t { + transmute(vld1q_v2i64(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.8"))] +pub unsafe fn vld1_p8(ptr: *const p8) -> poly8x8_t { + transmute(vld1_v8i8(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.8"))] +pub unsafe fn vld1q_p8(ptr: *const p8) -> poly8x16_t { + transmute(vld1q_v16i8(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.16"))] +pub unsafe fn vld1_p16(ptr: *const p16) -> poly16x4_t { + transmute(vld1_v4i16(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.16"))] +pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t { + transmute(vld1q_v8i16(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(vldr))] +pub unsafe fn vld1_p64(ptr: *const p64) -> poly64x1_t { + transmute(vld1_v1i64(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr("vld1.64"))] +pub unsafe fn vld1q_p64(ptr: *const p64) -> poly64x2_t { + transmute(vld1q_v2i64(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vldr))] +pub unsafe fn vld1_f32(ptr: *const f32) -> float32x2_t { + vld1_v2f32(ptr as *const i8, align_of::() as i32) +} + +/// Load multiple single-element structures to one, two, three, or four registers. 
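+/// # Examples
+///
+/// A sketch pairing the four-lane float load with the truncating conversion
+/// defined later in this file (assumes an ARMv7 NEON target; as noted there,
+/// the conversion is undefined behaviour if a lane overflows `i32`):
+///
+/// ```ignore
+/// let data = [1.5f32, -2.5, 3.9, -4.1];
+/// unsafe {
+///     let v = vld1q_f32(data.as_ptr());       // load 4 x f32
+///     let _t = vcvtq_s32_f32(v);              // lanes truncate toward zero: 1, -2, 3, -4
+/// }
+/// ```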
+#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vld1.32"))] +pub unsafe fn vld1q_f32(ptr: *const f32) -> float32x4_t { + vld1q_v4f32(ptr as *const i8, align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.8"))] +pub unsafe fn vst1_s8(ptr: *mut i8, a: int8x8_t) { + vst1_v8i8(ptr as *const i8, a, align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.8"))] +pub unsafe fn vst1q_s8(ptr: *mut i8, a: int8x16_t) { + vst1q_v16i8(ptr as *const i8, a, align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.16"))] +pub unsafe fn vst1_s16(ptr: *mut i16, a: int16x4_t) { + vst1_v4i16(ptr as *const i8, a, align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.16"))] +pub unsafe fn vst1q_s16(ptr: *mut i16, a: int16x8_t) { + vst1q_v8i16(ptr as *const i8, a, align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.32"))] +pub unsafe fn vst1_s32(ptr: *mut i32, a: int32x2_t) { + vst1_v2i32(ptr as *const i8, a, align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.32"))] +pub unsafe fn vst1q_s32(ptr: *mut i32, a: int32x4_t) { + vst1q_v4i32(ptr as *const i8, a, align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.64"))] +pub unsafe fn vst1_s64(ptr: *mut i64, a: int64x1_t) { + vst1_v1i64(ptr as *const i8, a, align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.64"))] +pub unsafe fn vst1q_s64(ptr: *mut i64, a: int64x2_t) { + vst1q_v2i64(ptr as *const i8, a, align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.8"))] +pub unsafe fn vst1_u8(ptr: *mut u8, a: uint8x8_t) { + vst1_v8i8(ptr as *const i8, transmute(a), align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.8"))] +pub unsafe fn vst1q_u8(ptr: *mut u8, a: uint8x16_t) { + vst1q_v16i8(ptr as *const i8, transmute(a), align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. 
+#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.16"))] +pub unsafe fn vst1_u16(ptr: *mut u16, a: uint16x4_t) { + vst1_v4i16(ptr as *const i8, transmute(a), align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.16"))] +pub unsafe fn vst1q_u16(ptr: *mut u16, a: uint16x8_t) { + vst1q_v8i16(ptr as *const i8, transmute(a), align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.32"))] +pub unsafe fn vst1_u32(ptr: *mut u32, a: uint32x2_t) { + vst1_v2i32(ptr as *const i8, transmute(a), align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.32"))] +pub unsafe fn vst1q_u32(ptr: *mut u32, a: uint32x4_t) { + vst1q_v4i32(ptr as *const i8, transmute(a), align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.64"))] +pub unsafe fn vst1_u64(ptr: *mut u64, a: uint64x1_t) { + vst1_v1i64(ptr as *const i8, transmute(a), align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.64"))] +pub unsafe fn vst1q_u64(ptr: *mut u64, a: uint64x2_t) { + vst1q_v2i64(ptr as *const i8, transmute(a), align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.8"))] +pub unsafe fn vst1_p8(ptr: *mut p8, a: poly8x8_t) { + vst1_v8i8(ptr as *const i8, transmute(a), align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.8"))] +pub unsafe fn vst1q_p8(ptr: *mut p8, a: poly8x16_t) { + vst1q_v16i8(ptr as *const i8, transmute(a), align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.16"))] +pub unsafe fn vst1_p16(ptr: *mut p16, a: poly16x4_t) { + vst1_v4i16(ptr as *const i8, transmute(a), align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.16"))] +pub unsafe fn vst1q_p16(ptr: *mut p16, a: poly16x8_t) { + vst1q_v8i16(ptr as *const i8, transmute(a), align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,aes,v8")] +#[cfg_attr(test, assert_instr("vst1.64"))] +pub unsafe fn vst1_p64(ptr: *mut p64, a: poly64x1_t) { + vst1_v1i64(ptr as *const i8, transmute(a), align_of::() as i32) +} + +/// Store multiple single-element structures from one, two, three, or four registers. 
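+/// # Examples
+///
+/// A sketch for the polynomial variant (assumes an ARM target with NEON and
+/// the `aes` feature, as required by the attribute below; building the vector
+/// via `transmute` is an assumption for illustration):
+///
+/// ```ignore
+/// let mut dst = [0u64; 2];
+/// unsafe {
+///     let v: poly64x2_t = core::mem::transmute([0x1234_u64, 0x5678_u64]);
+///     vst1q_p64(dst.as_mut_ptr(), v); // store both 64-bit polynomial lanes
+/// }
+/// assert_eq!(dst, [0x1234, 0x5678]);
+/// ```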
+#[inline] +#[target_feature(enable = "neon,aes,v8")] +#[cfg_attr(test, assert_instr("vst1.64"))] +pub unsafe fn vst1q_p64(ptr: *mut p64, a: poly64x2_t) { + vst1q_v2i64(ptr as *const i8, transmute(a), align_of::() as i32) +} + +// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.32"))] +pub unsafe fn vst1_f32(ptr: *mut f32, a: float32x2_t) { + vst1_v2f32(ptr as *const i8, a, align_of::() as i32) +} + +// Store multiple single-element structures from one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vst1.32"))] +pub unsafe fn vst1q_f32(ptr: *mut f32, a: float32x4_t) { + vst1q_v4f32(ptr as *const i8, a, align_of::() as i32) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + vtbl1(a, b) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + transmute(vtbl1(transmute(a), transmute(b))) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl1_p8(a: poly8x8_t, b: uint8x8_t) -> poly8x8_t { + transmute(vtbl1(transmute(a), transmute(b))) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl2_s8(a: int8x8x2_t, b: int8x8_t) -> int8x8_t { + vtbl2(a.0, a.1, b) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl2_u8(a: uint8x8x2_t, b: uint8x8_t) -> uint8x8_t { + transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b))) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl2_p8(a: poly8x8x2_t, b: uint8x8_t) -> poly8x8_t { + transmute(vtbl2(transmute(a.0), transmute(a.1), transmute(b))) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl3_s8(a: int8x8x3_t, b: int8x8_t) -> int8x8_t { + vtbl3(a.0, a.1, a.2, b) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl3_u8(a: uint8x8x3_t, b: uint8x8_t) -> uint8x8_t { + transmute(vtbl3( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(b), + )) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl3_p8(a: poly8x8x3_t, b: uint8x8_t) -> poly8x8_t { + transmute(vtbl3( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(b), + )) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl4_s8(a: int8x8x4_t, b: int8x8_t) -> int8x8_t { + vtbl4(a.0, a.1, a.2, a.3, b) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] 
+#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl4_u8(a: uint8x8x4_t, b: uint8x8_t) -> uint8x8_t { + transmute(vtbl4( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + transmute(b), + )) +} + +/// Table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbl))] +pub unsafe fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t { + transmute(vtbl4( + transmute(a.0), + transmute(a.1), + transmute(a.2), + transmute(a.3), + transmute(b), + )) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx1_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + vtbx1(a, b, c) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + transmute(vtbx1(transmute(a), transmute(b), transmute(c))) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx1_p8(a: poly8x8_t, b: poly8x8_t, c: uint8x8_t) -> poly8x8_t { + transmute(vtbx1(transmute(a), transmute(b), transmute(c))) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx2_s8(a: int8x8_t, b: int8x8x2_t, c: int8x8_t) -> int8x8_t { + vtbx2(a, b.0, b.1, c) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx2_u8(a: uint8x8_t, b: uint8x8x2_t, c: uint8x8_t) -> uint8x8_t { + transmute(vtbx2( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(c), + )) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t { + transmute(vtbx2( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(c), + )) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx3_s8(a: int8x8_t, b: int8x8x3_t, c: int8x8_t) -> int8x8_t { + vtbx3(a, b.0, b.1, b.2, c) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t { + transmute(vtbx3( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(c), + )) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t { + transmute(vtbx3( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(c), + )) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx4_s8(a: int8x8_t, b: int8x8x4_t, c: int8x8_t) -> int8x8_t { + 
vtbx4(a, b.0, b.1, b.2, b.3, c) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx4_u8(a: uint8x8_t, b: uint8x8x4_t, c: uint8x8_t) -> uint8x8_t { + transmute(vtbx4( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + transmute(c), + )) +} + +/// Extended table look-up +#[inline] +#[cfg(target_endian = "little")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr(vtbx))] +pub unsafe fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> poly8x8_t { + transmute(vtbx4( + transmute(a), + transmute(b.0), + transmute(b.1), + transmute(b.2), + transmute(b.3), + transmute(c), + )) +} + +// These float-to-int implementations have undefined behaviour when `a` overflows +// the destination type. Clang has the same problem: https://llvm.org/PR47510 + +/// Floating-point Convert to Signed fixed-point, rounding toward Zero (vector) +#[inline] +#[target_feature(enable = "neon")] +#[target_feature(enable = "v7")] +#[cfg_attr(test, assert_instr("vcvt.s32.f32"))] +pub unsafe fn vcvtq_s32_f32(a: float32x4_t) -> int32x4_t { + transmute(simd_cast::<_, i32x4>(transmute::<_, f32x4>(a))) +} + +/// Floating-point Convert to Unsigned fixed-point, rounding toward Zero (vector) +#[inline] +#[target_feature(enable = "neon")] +#[target_feature(enable = "v7")] +#[cfg_attr(test, assert_instr("vcvt.u32.f32"))] +pub unsafe fn vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t { + transmute(simd_cast::<_, u32x4>(transmute::<_, f32x4>(a))) +} + +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + static_assert_imm3!(N); + let n = N as i8; + vshiftins_v8i8(a, b, int8x8_t(n, n, n, n, n, n, n, n)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + static_assert_imm3!(N); + let n = N as i8; + vshiftins_v16i8( + a, + b, + int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), + ) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert_imm4!(N); + let n = N as i16; + vshiftins_v4i16(a, b, int16x4_t(n, n, n, n)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert_imm4!(N); + let n = N as i16; + vshiftins_v8i16(a, b, int16x8_t(n, n, n, n, n, n, n, n)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert!(N: i32 where N >= 0 && N <= 31); + vshiftins_v2i32(a, b, int32x2_t(N, N)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub 
unsafe fn vsliq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert!(N: i32 where N >= 0 && N <= 31); + vshiftins_v4i32(a, b, int32x4_t(N, N, N, N)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + static_assert!(N : i32 where 0 <= N && N <= 63); + vshiftins_v1i64(a, b, int64x1_t(N as i64)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + static_assert!(N : i32 where 0 <= N && N <= 63); + vshiftins_v2i64(a, b, int64x2_t(N as i64, N as i64)) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert_imm3!(N); + let n = N as i8; + transmute(vshiftins_v8i8( + transmute(a), + transmute(b), + int8x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert_imm3!(N); + let n = N as i8; + transmute(vshiftins_v16i8( + transmute(a), + transmute(b), + int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert_imm4!(N); + let n = N as i16; + transmute(vshiftins_v4i16( + transmute(a), + transmute(b), + int16x4_t(n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert_imm4!(N); + let n = N as i16; + transmute(vshiftins_v8i16( + transmute(a), + transmute(b), + int16x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert!(N: i32 where N >= 0 && N <= 31); + transmute(vshiftins_v2i32(transmute(a), transmute(b), int32x2_t(N, N))) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + static_assert!(N: i32 where N >= 0 && N <= 31); + transmute(vshiftins_v4i32( + transmute(a), + transmute(b), + int32x4_t(N, N, N, N), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + static_assert!(N : i32 where 0 <= N && N <= 63); + transmute(vshiftins_v1i64( 
+ transmute(a), + transmute(b), + int64x1_t(N as i64), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert!(N : i32 where 0 <= N && N <= 63); + transmute(vshiftins_v2i64( + transmute(a), + transmute(b), + int64x2_t(N as i64, N as i64), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + static_assert_imm3!(N); + let n = N as i8; + transmute(vshiftins_v8i8( + transmute(a), + transmute(b), + int8x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + static_assert_imm3!(N); + let n = N as i8; + transmute(vshiftins_v16i8( + transmute(a), + transmute(b), + int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + static_assert_imm4!(N); + let n = N as i16; + transmute(vshiftins_v4i16( + transmute(a), + transmute(b), + int16x4_t(n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsli.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + static_assert_imm4!(N); + let n = N as i16; + transmute(vshiftins_v8i16( + transmute(a), + transmute(b), + int16x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7,aes")] +#[cfg_attr(test, assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsli_n_p64(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t { + static_assert!(N : i32 where 0 <= N && N <= 63); + transmute(vshiftins_v1i64( + transmute(a), + transmute(b), + int64x1_t(N as i64), + )) +} +/// Shift Left and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7,aes")] +#[cfg_attr(test, assert_instr("vsli.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsliq_n_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + static_assert!(N : i32 where 0 <= N && N <= 63); + transmute(vshiftins_v2i64( + transmute(a), + transmute(b), + int64x2_t(N as i64, N as i64), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + static_assert!(N : i32 where 1 <= N && N <= 8); + let n = -N as i8; + vshiftins_v8i8(a, b, int8x8_t(n, n, n, n, n, n, n, n)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + static_assert!(N : i32 where 1 <= N && N <= 8); + let n = -N as i8; + 
vshiftins_v16i8( + a, + b, + int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), + ) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + static_assert!(N : i32 where 1 <= N && N <= 16); + let n = -N as i16; + vshiftins_v4i16(a, b, int16x4_t(n, n, n, n)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + static_assert!(N : i32 where 1 <= N && N <= 16); + let n = -N as i16; + vshiftins_v8i16(a, b, int16x8_t(n, n, n, n, n, n, n, n)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + static_assert!(N : i32 where 1 <= N && N <= 32); + vshiftins_v2i32(a, b, int32x2_t(-N, -N)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + static_assert!(N : i32 where 1 <= N && N <= 32); + vshiftins_v4i32(a, b, int32x4_t(-N, -N, -N, -N)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + static_assert!(N : i32 where 1 <= N && N <= 64); + vshiftins_v1i64(a, b, int64x1_t(-N as i64)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + static_assert!(N : i32 where 1 <= N && N <= 64); + vshiftins_v2i64(a, b, int64x2_t(-N as i64, -N as i64)) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert!(N : i32 where 1 <= N && N <= 8); + let n = -N as i8; + transmute(vshiftins_v8i8( + transmute(a), + transmute(b), + int8x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert!(N : i32 where 1 <= N && N <= 8); + let n = -N as i8; + transmute(vshiftins_v16i8( + transmute(a), + transmute(b), + int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert!(N : i32 where 1 <= N && N <= 16); + let n = -N as i16; + transmute(vshiftins_v4i16( + transmute(a), + transmute(b), + int16x4_t(n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) 
+#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert!(N : i32 where 1 <= N && N <= 16); + let n = -N as i16; + transmute(vshiftins_v8i16( + transmute(a), + transmute(b), + int16x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert!(N : i32 where 1 <= N && N <= 32); + transmute(vshiftins_v2i32( + transmute(a), + transmute(b), + int32x2_t(-N, -N), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.32", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + static_assert!(N : i32 where 1 <= N && N <= 32); + transmute(vshiftins_v4i32( + transmute(a), + transmute(b), + int32x4_t(-N, -N, -N, -N), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + static_assert!(N : i32 where 1 <= N && N <= 64); + transmute(vshiftins_v1i64( + transmute(a), + transmute(b), + int64x1_t(-N as i64), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert!(N : i32 where 1 <= N && N <= 64); + transmute(vshiftins_v2i64( + transmute(a), + transmute(b), + int64x2_t(-N as i64, -N as i64), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t { + static_assert!(N : i32 where 1 <= N && N <= 8); + let n = -N as i8; + transmute(vshiftins_v8i8( + transmute(a), + transmute(b), + int8x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.8", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t { + static_assert!(N : i32 where 1 <= N && N <= 8); + let n = -N as i8; + transmute(vshiftins_v16i8( + transmute(a), + transmute(b), + int8x16_t(n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t { + static_assert!(N : i32 where 1 <= N && N <= 16); + let n = -N as i16; + transmute(vshiftins_v4i16( + transmute(a), + transmute(b), + int16x4_t(n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(test, assert_instr("vsri.16", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t { + static_assert!(N : i32 where 1 <= N && N <= 16); + let 
n = -N as i16; + transmute(vshiftins_v8i16( + transmute(a), + transmute(b), + int16x8_t(n, n, n, n, n, n, n, n), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7,aes")] +#[cfg_attr(test, assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsri_n_p64(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t { + static_assert!(N : i32 where 1 <= N && N <= 64); + transmute(vshiftins_v1i64( + transmute(a), + transmute(b), + int64x1_t(-N as i64), + )) +} +/// Shift Right and Insert (immediate) +#[inline] +#[target_feature(enable = "neon,v7,aes")] +#[cfg_attr(test, assert_instr("vsri.64", N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vsriq_n_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t { + static_assert!(N : i32 where 1 <= N && N <= 64); + transmute(vshiftins_v2i64( + transmute(a), + transmute(b), + int64x2_t(-N as i64, -N as i64), + )) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::core_arch::{arm::*, simd::*}; + use crate::mem::transmute; + use stdarch_test::simd_test; + + #[simd_test(enable = "neon")] + unsafe fn test_vcvtq_s32_f32() { + let f = f32x4::new(-1., 2., 3., 4.); + let e = i32x4::new(-1, 2, 3, 4); + let r: i32x4 = transmute(vcvtq_s32_f32(transmute(f))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcvtq_u32_f32() { + let f = f32x4::new(1., 2., 3., 4.); + let e = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vcvtq_u32_f32(transmute(f))); + assert_eq!(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/arm/sat.rs b/library/stdarch/crates/core_arch/src/arm/sat.rs new file mode 100644 index 000000000..38c98d734 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/sat.rs @@ -0,0 +1,8 @@ +//! # References: +//! +//! - Section 8.4 "Saturating intrinsics" +//! +//! Intrinsics that could live here: +//! +//! - __ssat +//! - __usat diff --git a/library/stdarch/crates/core_arch/src/arm/simd32.rs b/library/stdarch/crates/core_arch/src/arm/simd32.rs new file mode 100644 index 000000000..2d867acc8 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/simd32.rs @@ -0,0 +1,728 @@ +//! # References +//! +//! - Section 8.5 "32-bit SIMD intrinsics" of ACLE +//! +//! Intrinsics that could live here +//! +//! - \[x\] __sel +//! - \[ \] __ssat16 +//! - \[ \] __usat16 +//! - \[ \] __sxtab16 +//! - \[ \] __sxtb16 +//! - \[ \] __uxtab16 +//! - \[ \] __uxtb16 +//! - \[x\] __qadd8 +//! - \[x\] __qsub8 +//! - \[x\] __sadd8 +//! - \[x\] __shadd8 +//! - \[x\] __shsub8 +//! - \[x\] __ssub8 +//! - \[ \] __uadd8 +//! - \[ \] __uhadd8 +//! - \[ \] __uhsub8 +//! - \[ \] __uqadd8 +//! - \[ \] __uqsub8 +//! - \[x\] __usub8 +//! - \[x\] __usad8 +//! - \[x\] __usada8 +//! - \[x\] __qadd16 +//! - \[x\] __qasx +//! - \[x\] __qsax +//! - \[x\] __qsub16 +//! - \[x\] __sadd16 +//! - \[x\] __sasx +//! - \[x\] __shadd16 +//! - \[ \] __shasx +//! - \[ \] __shsax +//! - \[x\] __shsub16 +//! - \[ \] __ssax +//! - \[ \] __ssub16 +//! - \[ \] __uadd16 +//! - \[ \] __uasx +//! - \[ \] __uhadd16 +//! - \[ \] __uhasx +//! - \[ \] __uhsax +//! - \[ \] __uhsub16 +//! - \[ \] __uqadd16 +//! - \[ \] __uqasx +//! - \[x\] __uqsax +//! - \[ \] __uqsub16 +//! - \[ \] __usax +//! - \[ \] __usub16 +//! - \[x\] __smlad +//! - \[ \] __smladx +//! - \[ \] __smlald +//! - \[ \] __smlaldx +//! - \[x\] __smlsd +//! - \[ \] __smlsdx +//! - \[ \] __smlsld +//! - \[ \] __smlsldx +//! - \[x\] __smuad +//! - \[x\] __smuadx +//! - \[x\] __smusd +//! 
- \[x\] __smusdx
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+use crate::{core_arch::arm::dsp::int16x2_t, mem::transmute};
+
+types! {
+    /// ARM-specific 32-bit wide vector of four packed `i8`.
+    pub struct int8x4_t(i8, i8, i8, i8);
+    /// ARM-specific 32-bit wide vector of four packed `u8`.
+    pub struct uint8x4_t(u8, u8, u8, u8);
+}
+
+macro_rules! dsp_call {
+    ($name:expr, $a:expr, $b:expr) => {
+        transmute($name(transmute($a), transmute($b)))
+    };
+}
+
+extern "unadjusted" {
+    #[link_name = "llvm.arm.qadd8"]
+    fn arm_qadd8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qsub8"]
+    fn arm_qsub8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qsub16"]
+    fn arm_qsub16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qadd16"]
+    fn arm_qadd16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qasx"]
+    fn arm_qasx(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qsax"]
+    fn arm_qsax(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.sadd16"]
+    fn arm_sadd16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.sadd8"]
+    fn arm_sadd8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smlad"]
+    fn arm_smlad(a: i32, b: i32, c: i32) -> i32;
+
+    #[link_name = "llvm.arm.smlsd"]
+    fn arm_smlsd(a: i32, b: i32, c: i32) -> i32;
+
+    #[link_name = "llvm.arm.sasx"]
+    fn arm_sasx(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.sel"]
+    fn arm_sel(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shadd8"]
+    fn arm_shadd8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shadd16"]
+    fn arm_shadd16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shsub8"]
+    fn arm_shsub8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.ssub8"]
+    fn arm_ssub8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.usub8"]
+    fn arm_usub8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shsub16"]
+    fn arm_shsub16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smuad"]
+    fn arm_smuad(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smuadx"]
+    fn arm_smuadx(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smusd"]
+    fn arm_smusd(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smusdx"]
+    fn arm_smusdx(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.usad8"]
+    fn arm_usad8(a: i32, b: i32) -> u32;
+}
+
+/// Saturating four 8-bit integer additions
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+/// res\[2\] = a\[2\] + b\[2\]
+/// res\[3\] = a\[3\] + b\[3\]
+#[inline]
+#[cfg_attr(test, assert_instr(qadd8))]
+pub unsafe fn __qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_qadd8, a, b)
+}
+
+/// Saturating four 8-bit integer subtractions
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+/// res\[2\] = a\[2\] - b\[2\]
+/// res\[3\] = a\[3\] - b\[3\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsub8))]
+pub unsafe fn __qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_qsub8, a, b)
+}
+
+/// Saturating two 16-bit integer subtractions
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsub16))]
+pub unsafe fn __qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_qsub16, a, b)
+}
+
+/// Saturating two 16-bit integer additions
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+#[inline]
+#[cfg_attr(test, assert_instr(qadd16))]
+pub unsafe fn __qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_qadd16, a, b)
+}
+
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] - b\[1\]
+/// res\[1\] = a\[1\] + b\[0\]
+#[inline]
+#[cfg_attr(test, assert_instr(qasx))]
+pub unsafe fn __qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_qasx, a, b)
+}
+
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] + b\[1\]
+/// res\[1\] = a\[1\] - b\[0\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsax))]
+pub unsafe fn __qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_qsax, a, b)
+}
+
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sadd16))]
+pub unsafe fn __sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_sadd16, a, b)
+}
+
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+/// res\[2\] = a\[2\] + b\[2\]
+/// res\[3\] = a\[3\] + b\[3\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sadd8))]
+pub unsafe fn __sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_sadd8, a, b)
+}
+
+/// Dual 16-bit Signed Multiply with Addition of products
+/// and 32-bit accumulation.
+///
+/// Returns the 32-bit signed equivalent of
+/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\] + c
+#[inline]
+#[cfg_attr(test, assert_instr(smlad))]
+pub unsafe fn __smlad(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+    arm_smlad(transmute(a), transmute(b), c)
+}
+
+/// Dual 16-bit Signed Multiply with Subtraction of products
+/// and 32-bit accumulation and overflow detection.
+///
+/// Returns the 32-bit signed equivalent of
+/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\] + c
+#[inline]
+#[cfg_attr(test, assert_instr(smlsd))]
+pub unsafe fn __smlsd(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+    arm_smlsd(transmute(a), transmute(b), c)
+}
+
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[1\]
+/// res\[1\] = a\[1\] + b\[0\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sasx))]
+pub unsafe fn __sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_sasx, a, b)
+}
+
+/// Select bytes from each operand according to APSR GE flags
+///
+/// Returns the equivalent of
+///
+/// res\[0\] = GE\[0\] ? a\[0\] : b\[0\]
+/// res\[1\] = GE\[1\] ? a\[1\] : b\[1\]
+/// res\[2\] = GE\[2\] ? a\[2\] : b\[2\]
+/// res\[3\] = GE\[3\] ? a\[3\] : b\[3\]
+///
+/// where GE are bits of APSR
+#[inline]
+#[cfg_attr(test, assert_instr(sel))]
+pub unsafe fn __sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_sel, a, b)
+}
+
+/// Signed halving parallel byte-wise addition.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] + b\[0\]) / 2
+/// res\[1\] = (a\[1\] + b\[1\]) / 2
+/// res\[2\] = (a\[2\] + b\[2\]) / 2
+/// res\[3\] = (a\[3\] + b\[3\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shadd8))]
+pub unsafe fn __shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_shadd8, a, b)
+}
+
+/// Signed halving parallel halfword-wise addition.
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] + b\[0\]) / 2
+/// res\[1\] = (a\[1\] + b\[1\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shadd16))]
+pub unsafe fn __shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_shadd16, a, b)
+}
+
+/// Signed halving parallel byte-wise subtraction.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] - b\[0\]) / 2
+/// res\[1\] = (a\[1\] - b\[1\]) / 2
+/// res\[2\] = (a\[2\] - b\[2\]) / 2
+/// res\[3\] = (a\[3\] - b\[3\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shsub8))]
+pub unsafe fn __shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_shsub8, a, b)
+}
+
+/// Inserts a `USUB8` instruction.
+///
+/// Returns the 8-bit unsigned equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+/// res\[2\] = a\[2\] - b\[2\]
+/// res\[3\] = a\[3\] - b\[3\]
+///
+/// where \[0\] is the lower 8 bits and \[3\] is the upper 8 bits.
+/// The GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(usub8))]
+pub unsafe fn __usub8(a: uint8x4_t, b: uint8x4_t) -> uint8x4_t {
+    dsp_call!(arm_usub8, a, b)
+}
+
+/// Inserts a `SSUB8` instruction.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+/// res\[2\] = a\[2\] - b\[2\]
+/// res\[3\] = a\[3\] - b\[3\]
+///
+/// where \[0\] is the lower 8 bits and \[3\] is the upper 8 bits.
+/// The GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(ssub8))]
+pub unsafe fn __ssub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_ssub8, a, b)
+}
+
+/// Signed halving parallel halfword-wise subtraction.
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] - b\[0\]) / 2
+/// res\[1\] = (a\[1\] - b\[1\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shsub16))]
+pub unsafe fn __shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_shsub16, a, b)
+}
+
+/// Signed Dual Multiply Add.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smuad))]
+pub unsafe fn __smuad(a: int16x2_t, b: int16x2_t) -> i32 {
+    arm_smuad(transmute(a), transmute(b))
+}
+
+/// Signed Dual Multiply Add Reversed.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[1\] + a\[1\] * b\[0\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smuadx))]
+pub unsafe fn __smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
+    arm_smuadx(transmute(a), transmute(b))
+}
+
+/// Signed Dual Multiply Subtract.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\]
+///
+/// and sets the Q flag if overflow occurs on the subtraction.
+#[inline]
+#[cfg_attr(test, assert_instr(smusd))]
+pub unsafe fn __smusd(a: int16x2_t, b: int16x2_t) -> i32 {
+    arm_smusd(transmute(a), transmute(b))
+}
+
+/// Signed Dual Multiply Subtract Reversed.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[1\] - a\[1\] * b\[0\]
+///
+/// and sets the Q flag if overflow occurs on the subtraction.
+#[inline]
+#[cfg_attr(test, assert_instr(smusdx))]
+pub unsafe fn __smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
+    arm_smusdx(transmute(a), transmute(b))
+}
+
+/// Sum of 8-bit absolute differences.
+///
+/// Returns the 32-bit unsigned equivalent of
+///
+/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
+///       abs(a\[2\] - b\[2\]) + abs(a\[3\] - b\[3\])
+#[inline]
+#[cfg_attr(test, assert_instr(usad8))]
+pub unsafe fn __usad8(a: int8x4_t, b: int8x4_t) -> u32 {
+    arm_usad8(transmute(a), transmute(b))
+}
+
+/// Sum of 8-bit absolute differences and constant.
+///
+/// Returns the 32-bit unsigned equivalent of
+///
+/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
+///       abs(a\[2\] - b\[2\]) + abs(a\[3\] - b\[3\]) + c
+#[inline]
+#[cfg_attr(test, assert_instr(usad8))]
+pub unsafe fn __usada8(a: int8x4_t, b: int8x4_t, c: u32) -> u32 {
+    __usad8(a, b) + c
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::core_arch::simd::{i16x2, i8x4, u8x4};
+    use std::mem::transmute;
+    use stdarch_test::simd_test;
+
+    #[test]
+    fn qadd8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, i8::MAX);
+            let b = i8x4::new(2, -1, 0, 1);
+            let c = i8x4::new(3, 1, 3, i8::MAX);
+            let r: i8x4 = dsp_call!(super::__qadd8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qsub8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, i8::MIN);
+            let b = i8x4::new(2, -1, 0, 1);
+            let c = i8x4::new(-1, 3, 3, i8::MIN);
+            let r: i8x4 = dsp_call!(super::__qsub8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qadd16() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(2, -1);
+            let c = i16x2::new(3, 1);
+            let r: i16x2 = dsp_call!(super::__qadd16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qsub16() {
+        unsafe {
+            let a = i16x2::new(10, 20);
+            let b = i16x2::new(20, -10);
+            let c = i16x2::new(-10, 30);
+            let r: i16x2 = dsp_call!(super::__qsub16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qasx() {
+        unsafe {
+            let a = i16x2::new(1, i16::MAX);
+            let b = i16x2::new(2, 2);
+            let c = i16x2::new(-1, i16::MAX);
+            let r: i16x2 = dsp_call!(super::__qasx, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qsax() {
+        unsafe {
+            let a = i16x2::new(1, i16::MAX);
+            let b = i16x2::new(2, 2);
+            let c = i16x2::new(3, i16::MAX - 2);
+            let r: i16x2 = dsp_call!(super::__qsax, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn sadd16() {
+        unsafe {
+            let a = i16x2::new(1, i16::MAX);
+            let b = i16x2::new(2, 2);
+            let c = i16x2::new(3, -i16::MAX);
+            let r: i16x2 = dsp_call!(super::__sadd16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn sadd8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, i8::MAX);
+            let b = i8x4::new(4, 3, 2, 2);
+            let c = i8x4::new(5, 5, 5, -i8::MAX);
+            let r: i8x4 = dsp_call!(super::__sadd8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn sasx() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(2, 1);
+            let c = i16x2::new(0, 4);
+            let r: i16x2 = dsp_call!(super::__sasx, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn smlad() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(3, 4);
+            let r = super::__smlad(transmute(a), transmute(b), 10);
+            assert_eq!(r, (1 * 3) + (2 * 4) + 10);
+        }
+    }
+
+    #[test]
+    fn smlsd() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(3, 4);
+            let r = super::__smlsd(transmute(a), transmute(b), 10);
+            assert_eq!(r, ((1 * 3) - (2 * 4)) + 10);
+        }
+    }
+
+    #[test]
+    fn sel() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, i8::MAX);
+            let b = i8x4::new(4, 3, 2, 2);
+            // call sadd8() to set GE bits
+            super::__sadd8(transmute(a), transmute(b));
+            let c = i8x4::new(1, 2, 3, i8::MAX);
+            let r: i8x4 = dsp_call!(super::__sel, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn shadd8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, 4);
+            let b = i8x4::new(5, 4, 3, 2);
+
let c = i8x4::new(3, 3, 3, 3); + let r: i8x4 = dsp_call!(super::__shadd8, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn shadd16() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(5, 4); + let c = i16x2::new(3, 3); + let r: i16x2 = dsp_call!(super::__shadd16, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn shsub8() { + unsafe { + let a = i8x4::new(1, 2, 3, 4); + let b = i8x4::new(5, 4, 3, 2); + let c = i8x4::new(-2, -1, 0, 1); + let r: i8x4 = dsp_call!(super::__shsub8, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn ssub8() { + unsafe { + let a = i8x4::new(1, 2, 3, 4); + let b = i8x4::new(5, 4, 3, 2); + let c = i8x4::new(-4, -2, 0, 2); + let r: i8x4 = dsp_call!(super::__ssub8, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn usub8() { + unsafe { + let a = u8x4::new(1, 2, 3, 4); + let b = u8x4::new(5, 4, 3, 2); + let c = u8x4::new(252, 254, 0, 2); + let r: u8x4 = dsp_call!(super::__usub8, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn shsub16() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(5, 4); + let c = i16x2::new(-2, -1); + let r: i16x2 = dsp_call!(super::__shsub16, a, b); + assert_eq!(r, c); + } + } + + #[test] + fn smuad() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(5, 4); + let r = super::__smuad(transmute(a), transmute(b)); + assert_eq!(r, 13); + } + } + + #[test] + fn smuadx() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(5, 4); + let r = super::__smuadx(transmute(a), transmute(b)); + assert_eq!(r, 14); + } + } + + #[test] + fn smusd() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(5, 4); + let r = super::__smusd(transmute(a), transmute(b)); + assert_eq!(r, -3); + } + } + + #[test] + fn smusdx() { + unsafe { + let a = i16x2::new(1, 2); + let b = i16x2::new(5, 4); + let r = super::__smusdx(transmute(a), transmute(b)); + assert_eq!(r, -6); + } + } + + #[test] + fn usad8() { + unsafe { + let a = i8x4::new(1, 2, 3, 4); + let b = i8x4::new(4, 3, 2, 1); + let r = super::__usad8(transmute(a), transmute(b)); + assert_eq!(r, 8); + } + } + + #[test] + fn usad8a() { + unsafe { + let a = i8x4::new(1, 2, 3, 4); + let b = i8x4::new(4, 3, 2, 1); + let c = 10; + let r = super::__usada8(transmute(a), transmute(b), c); + assert_eq!(r, 8 + c); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/arm/v6.rs b/library/stdarch/crates/core_arch/src/arm/v6.rs new file mode 100644 index 000000000..5df30cd62 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/v6.rs @@ -0,0 +1,49 @@ +//! ARMv6 intrinsics. +//! +//! The reference is [ARMv6-M Architecture Reference Manual][armv6m]. +//! +//! [armv6m]: +//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index. +//! html + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Reverse the order of the bytes. +#[inline] +#[cfg_attr(test, assert_instr(rev))] +pub unsafe fn _rev_u16(x: u16) -> u16 { + x.swap_bytes() as u16 +} + +/// Reverse the order of the bytes. 
+#[inline] +#[cfg_attr(test, assert_instr(rev))] +pub unsafe fn _rev_u32(x: u32) -> u32 { + x.swap_bytes() as u32 +} + +#[cfg(test)] +mod tests { + use crate::core_arch::arm::v6; + + #[test] + fn _rev_u16() { + unsafe { + assert_eq!( + v6::_rev_u16(0b0000_0000_1111_1111_u16), + 0b1111_1111_0000_0000_u16 + ); + } + } + + #[test] + fn _rev_u32() { + unsafe { + assert_eq!( + v6::_rev_u32(0b0000_0000_1111_1111_0000_0000_1111_1111_u32), + 0b1111_1111_0000_0000_1111_1111_0000_0000_u32 + ); + } + } +} diff --git a/library/stdarch/crates/core_arch/src/arm/v7.rs b/library/stdarch/crates/core_arch/src/arm/v7.rs new file mode 100644 index 000000000..e7507f9b9 --- /dev/null +++ b/library/stdarch/crates/core_arch/src/arm/v7.rs @@ -0,0 +1,88 @@ +//! ARMv7 intrinsics. +//! +//! The reference is [ARMv7-M Architecture Reference Manual (Issue +//! E.b)][armv7m]. +//! +//! [armv7m]: +//! http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e. +//! b/index.html + +pub use super::v6::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Count Leading Zeros. +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))] +// FIXME: https://github.com/rust-lang/stdarch/issues/382 +// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))] +pub unsafe fn _clz_u8(x: u8) -> u8 { + x.leading_zeros() as u8 +} + +/// Count Leading Zeros. +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))] +// FIXME: https://github.com/rust-lang/stdarch/issues/382 +// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))] +pub unsafe fn _clz_u16(x: u16) -> u16 { + x.leading_zeros() as u16 +} + +/// Count Leading Zeros. +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(clz))] +// FIXME: https://github.com/rust-lang/stdarch/issues/382 +// #[cfg_attr(all(test, target_arch = "arm"), assert_instr(clz))] +pub unsafe fn _clz_u32(x: u32) -> u32 { + x.leading_zeros() as u32 +} + +/// Reverse the bit order. +#[inline] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(test, assert_instr(rbit))] +pub unsafe fn _rbit_u32(x: u32) -> u32 { + crate::intrinsics::bitreverse(x) +} + +#[cfg(test)] +mod tests { + use crate::core_arch::arm::v7; + + #[test] + fn _clz_u8() { + unsafe { + assert_eq!(v7::_clz_u8(0b0000_1010u8), 4u8); + } + } + + #[test] + fn _clz_u16() { + unsafe { + assert_eq!(v7::_clz_u16(0b0000_1010u16), 12u16); + } + } + + #[test] + fn _clz_u32() { + unsafe { + assert_eq!(v7::_clz_u32(0b0000_1010u32), 28u32); + } + } + + #[test] + #[cfg(dont_compile_me)] // FIXME need to add `v7` upstream in rustc + fn _rbit_u32() { + unsafe { + assert_eq!( + v7::_rbit_u32(0b0000_1010u32), + 0b0101_0000_0000_0000_0000_0000_0000_0000u32 + ); + } + } +} -- cgit v1.2.3
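
As a quick illustration of what the vsli_n_* / vsri_n_* intrinsics added in neon.rs above compute per lane, here is a minimal plain-Rust sketch of the shift-and-insert behaviour, assuming the usual A32 VSLI/VSRI semantics (VSLI preserves the low N bits of the destination, VSRI preserves the high N bits). The helper names are illustrative only and are not part of the crate or of this patch.

// Per-lane reference model of VSLI.8 / VSRI.8 (illustrative sketch, not part of the patch).
fn vsli_lane_u8(a: u8, b: u8, n: u32) -> u8 {
    // Shift b left by n and insert it into a; the low n bits of a are preserved.
    assert!(n <= 7);
    (b << n) | (a & ((1u8 << n) - 1))
}

fn vsri_lane_u8(a: u8, b: u8, n: u32) -> u8 {
    // Shift b right by n and insert it into a; the high n bits of a are preserved.
    // u16 intermediates avoid the shift-overflow panic when n == 8.
    assert!((1..=8).contains(&n));
    let inserted = ((b as u16) >> n) as u8;
    let kept = a & !((0xffu16 >> n) as u8);
    inserted | kept
}

fn main() {
    assert_eq!(vsli_lane_u8(0b1111_1111, 0b0000_0001, 4), 0b0001_1111);
    assert_eq!(vsri_lane_u8(0b1111_1111, 0b1000_0000, 4), 0b1111_1000);
}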
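
Similarly, the dual-multiply and sum-of-absolute-differences formulas documented for __smlad and __usad8 in simd32.rs above can be checked on any host with a small reference model. This is only a sketch of the documented arithmetic (it ignores the Q and GE flag side effects), and the *_ref names are hypothetical helpers, not APIs of the crate.

// Host-side reference model of the documented SMLAD / USAD8 formulas (illustrative only).
fn smlad_ref(a: [i16; 2], b: [i16; 2], acc: i32) -> i32 {
    // res = a[0] * b[0] + a[1] * b[1] + acc, with wrapping 32-bit accumulation.
    // The real SMLAD also sets the Q flag on overflow, which this model ignores.
    (a[0] as i32)
        .wrapping_mul(b[0] as i32)
        .wrapping_add((a[1] as i32).wrapping_mul(b[1] as i32))
        .wrapping_add(acc)
}

fn usad8_ref(a: [u8; 4], b: [u8; 4]) -> u32 {
    // res = |a[0] - b[0]| + |a[1] - b[1]| + |a[2] - b[2]| + |a[3] - b[3]|
    a.iter()
        .zip(b.iter())
        .map(|(&x, &y)| (x as i32 - y as i32).unsigned_abs())
        .sum()
}

fn main() {
    // Mirrors the smlad and usad8 unit tests in simd32.rs above.
    assert_eq!(smlad_ref([1, 2], [3, 4], 10), (1 * 3) + (2 * 4) + 10);
    assert_eq!(usad8_ref([1, 2, 3, 4], [4, 3, 2, 1]), 8);
}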