diff options
Diffstat (limited to 'library/stdarch/crates')
22 files changed, 1348 insertions, 203 deletions
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs index 043f7ed51..0559aea83 100644 --- a/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/mod.rs @@ -12461,30 +12461,30 @@ mod tests { } #[simd_test(enable = "neon,i8mm")] unsafe fn test_vmmlaq_s32() { - let a: i32x4 = i32x4::new(1, 3, 4, 9); - let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16); - let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16); - let e: i32x4 = i32x4::new(1, 2, 3, 4); + let a = i32x4::new(1, 3, 4, -0x10000); + let b = i8x16::new(1, 21, 31, 14, 5, 6, -128, 8, 9, 13, 15, 12, 13, -1, 20, 16); + let c = i8x16::new(12, 22, 3, 4, -1, 56, 7, 8, 91, 10, -128, 15, 13, 14, 17, 16); + let e = i32x4::new(123, -5353, 690, -65576); let r: i32x4 = transmute(vmmlaq_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon,i8mm")] unsafe fn test_vmmlaq_u32() { - let a: u32x4 = u32x4::new(1, 3, 4, 9); - let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16); - let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16); - let e: u32x4 = u32x4::new(1, 2, 3, 4); + let a = u32x4::new(1, 3, 4, 0xffff0000); + let b = u8x16::new(1, 21, 31, 14, 5, 6, 128, 8, 9, 13, 15, 12, 13, 255, 20, 16); + let c = u8x16::new(12, 22, 3, 4, 255, 56, 7, 8, 91, 10, 128, 15, 13, 14, 17, 16); + let e = u32x4::new(3195, 6935, 18354, 4294909144); let r: u32x4 = transmute(vmmlaq_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon,i8mm")] unsafe fn test_vusmmlaq_s32() { - let a: i32x4 = i32x4::new(1, 3, 4, 9); - let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16); - let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16); - let e: 
i32x4 = i32x4::new(1, 2, 3, 4); + let a = i32x4::new(1, 3, 4, -0x10000); + let b = u8x16::new(1, 21, 31, 14, 5, 6, 128, 8, 9, 13, 15, 12, 13, 255, 20, 16); + let c = i8x16::new(12, 22, 3, 4, -1, 56, 7, 8, 91, 10, -128, 15, 13, 14, 17, 16); + let e = i32x4::new(1915, -1001, 15026, -61992); let r: i32x4 = transmute(vusmmlaq_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } diff --git a/library/stdarch/crates/core_arch/src/lib.rs b/library/stdarch/crates/core_arch/src/lib.rs index 9240d0e84..5a9727a0a 100644 --- a/library/stdarch/crates/core_arch/src/lib.rs +++ b/library/stdarch/crates/core_arch/src/lib.rs @@ -19,6 +19,7 @@ doc_cfg, tbm_target_feature, sse4a_target_feature, + riscv_target_feature, arm_target_feature, cmpxchg16b_target_feature, avx512_target_feature, @@ -30,8 +31,8 @@ f16c_target_feature, allow_internal_unstable, decl_macro, - bench_black_box, - asm_const + asm_const, + target_feature_11 )] #![cfg_attr(test, feature(test, abi_vectorcall))] #![deny(clippy::missing_inline_in_public_items)] diff --git a/library/stdarch/crates/core_arch/src/macros.rs b/library/stdarch/crates/core_arch/src/macros.rs index 1e6a3f405..1c917c52b 100644 --- a/library/stdarch/crates/core_arch/src/macros.rs +++ b/library/stdarch/crates/core_arch/src/macros.rs @@ -101,11 +101,11 @@ macro_rules! simd_shuffle2 { const IDX: [u32; 2] = $idx; } - simd_shuffle2($x, $y, ConstParam::<$($imm),+>::IDX) + simd_shuffle($x, $y, ConstParam::<$($imm),+>::IDX) }}; ($x:expr, $y:expr, $idx:expr $(,)?) => {{ const IDX: [u32; 2] = $idx; - simd_shuffle2($x, $y, IDX) + simd_shuffle($x, $y, IDX) }}; } @@ -117,11 +117,11 @@ macro_rules! simd_shuffle4 { const IDX: [u32; 4] = $idx; } - simd_shuffle4($x, $y, ConstParam::<$($imm),+>::IDX) + simd_shuffle($x, $y, ConstParam::<$($imm),+>::IDX) }}; ($x:expr, $y:expr, $idx:expr $(,)?) => {{ const IDX: [u32; 4] = $idx; - simd_shuffle4($x, $y, IDX) + simd_shuffle($x, $y, IDX) }}; } @@ -133,11 +133,11 @@ macro_rules! 
simd_shuffle8 { const IDX: [u32; 8] = $idx; } - simd_shuffle8($x, $y, ConstParam::<$($imm),+>::IDX) + simd_shuffle($x, $y, ConstParam::<$($imm),+>::IDX) }}; ($x:expr, $y:expr, $idx:expr $(,)?) => {{ const IDX: [u32; 8] = $idx; - simd_shuffle8($x, $y, IDX) + simd_shuffle($x, $y, IDX) }}; } @@ -149,11 +149,11 @@ macro_rules! simd_shuffle16 { const IDX: [u32; 16] = $idx; } - simd_shuffle16($x, $y, ConstParam::<$($imm),+>::IDX) + simd_shuffle($x, $y, ConstParam::<$($imm),+>::IDX) }}; ($x:expr, $y:expr, $idx:expr $(,)?) => {{ const IDX: [u32; 16] = $idx; - simd_shuffle16($x, $y, IDX) + simd_shuffle($x, $y, IDX) }}; } @@ -165,11 +165,11 @@ macro_rules! simd_shuffle32 { const IDX: [u32; 32] = $idx; } - simd_shuffle32($x, $y, ConstParam::<$($imm),+>::IDX) + simd_shuffle($x, $y, ConstParam::<$($imm),+>::IDX) }}; ($x:expr, $y:expr, $idx:expr $(,)?) => {{ const IDX: [u32; 32] = $idx; - simd_shuffle32($x, $y, IDX) + simd_shuffle($x, $y, IDX) }}; } @@ -181,10 +181,10 @@ macro_rules! simd_shuffle64 { const IDX: [u32; 64] = $idx; } - simd_shuffle64($x, $y, ConstParam::<$($imm),+>::IDX) + simd_shuffle($x, $y, ConstParam::<$($imm),+>::IDX) }}; ($x:expr, $y:expr, $idx:expr $(,)?) 
=> {{ const IDX: [u32; 64] = $idx; - simd_shuffle64($x, $y, IDX) + simd_shuffle($x, $y, IDX) }}; } diff --git a/library/stdarch/crates/core_arch/src/mod.rs b/library/stdarch/crates/core_arch/src/mod.rs index 20751eeec..2f7af22cb 100644 --- a/library/stdarch/crates/core_arch/src/mod.rs +++ b/library/stdarch/crates/core_arch/src/mod.rs @@ -3,6 +3,9 @@ #[macro_use] mod macros; +#[cfg(any(target_arch = "riscv32", target_arch = "riscv64", doc))] +mod riscv_shared; + #[cfg(any(target_arch = "arm", target_arch = "aarch64", doc))] mod arm_shared; @@ -276,10 +279,6 @@ mod aarch64; #[doc(cfg(any(target_arch = "arm")))] mod arm; -#[cfg(any(target_arch = "riscv32", target_arch = "riscv64", doc))] -#[doc(cfg(any(target_arch = "riscv32", target_arch = "riscv64")))] -mod riscv_shared; - #[cfg(any(target_arch = "riscv64", doc))] #[doc(cfg(any(target_arch = "riscv64")))] mod riscv64; diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs b/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs index 347735df1..0e35fe1f1 100644 --- a/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs +++ b/library/stdarch/crates/core_arch/src/riscv_shared/mod.rs @@ -1,4 +1,7 @@ //! Shared RISC-V intrinsics +mod p; + +pub use p::*; use crate::arch::asm; @@ -469,6 +472,17 @@ pub unsafe fn hinval_gvma_vmid(vmid: usize) { asm!(".insn r 0x73, 0, 0x33, x0, x0, {}", in(reg) vmid, options(nostack)) } +/// Invalidate hypervisor translation cache for all virtual machines and guest physical addresses +/// +/// This instruction invalidates any address-translation cache entries that an +/// `HFENCE.GVMA` instruction with the same values of `gaddr` and `vmid` would invalidate. +/// +/// This fence specifies all guest physical addresses and all virtual machines. 
+#[inline] +pub unsafe fn hinval_gvma_all() { + asm!(".insn r 0x73, 0, 0x33, x0, x0, x0", options(nostack)) +} + /// Reads the floating-point control and status register `fcsr` /// /// Register `fcsr` is a 32-bit read/write register that selects the dynamic rounding mode @@ -574,17 +588,6 @@ pub fn fsflags(value: u32) -> u32 { original } -/// Invalidate hypervisor translation cache for all virtual machines and guest physical addresses -/// -/// This instruction invalidates any address-translation cache entries that an -/// `HFENCE.GVMA` instruction with the same values of `gaddr` and `vmid` would invalidate. -/// -/// This fence specifies all guest physical addresses and all virtual machines. -#[inline] -pub unsafe fn hinval_gvma_all() { - asm!(".insn r 0x73, 0, 0x33, x0, x0, x0", options(nostack)) -} - /// `P0` transformation function as is used in the SM3 hash algorithm /// /// This function is included in `Zksh` extension. It's defined as: @@ -602,12 +605,10 @@ pub unsafe fn hinval_gvma_all() { /// According to RISC-V Cryptography Extensions, Volume I, the execution latency of /// this instruction must always be independent from the data it operates on. #[inline] +#[target_feature(enable = "zksh")] pub fn sm3p0(x: u32) -> u32 { let ans: u32; - unsafe { - // asm!("sm3p0 {}, {}", out(reg) ans, in(reg) x, options(nomem, nostack)) - asm!(".insn i 0x13, 0x1, {}, {}, 0x108", out(reg) ans, in(reg) x, options(nomem, nostack)) - }; + unsafe { asm!("sm3p0 {}, {}", lateout(reg) ans, in(reg) x, options(pure, nomem, nostack)) }; ans } @@ -634,12 +635,10 @@ pub fn sm3p0(x: u32) -> u32 { /// According to RISC-V Cryptography Extensions, Volume I, the execution latency of /// this instruction must always be independent from the data it operates on. 
#[inline] +#[target_feature(enable = "zksh")] pub fn sm3p1(x: u32) -> u32 { let ans: u32; - unsafe { - // asm!("sm3p1 {}, {}", out(reg) ans, in(reg) x, options(nomem, nostack)) - asm!(".insn i 0x13, 0x1, {}, {}, 0x109", out(reg) ans, in(reg) x, options(nomem, nostack)) - }; + unsafe { asm!("sm3p1 {}, {}", lateout(reg) ans, in(reg) x, options(pure, nomem, nostack)) }; ans } @@ -674,33 +673,28 @@ pub fn sm3p1(x: u32) -> u32 { /// It can be implemented by `sm4ed` instruction like: /// /// ```no_run +/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] +/// # fn round_function(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 { +/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ed; +/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ed; /// let a = x1 ^ x2 ^ x3 ^ rk; /// let c0 = sm4ed::<0>(x0, a); /// let c1 = sm4ed::<1>(c0, a); // c1 represents c[0..=1], etc. /// let c2 = sm4ed::<2>(c1, a); /// let c3 = sm4ed::<3>(c2, a); /// return c3; // c3 represents c[0..=3] +/// # } /// ``` /// /// According to RISC-V Cryptography Extensions, Volume I, the execution latency of /// this instruction must always be independent from the data it operates on. 
+#[inline] +#[target_feature(enable = "zksed")] pub fn sm4ed<const BS: u8>(x: u32, a: u32) -> u32 { static_assert!(BS: u8 where BS <= 3); let ans: u32; - match BS { - 0 => unsafe { - asm!(".insn r 0x33, 0, 0x18, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) a, options(nomem, nostack)) - }, - 1 => unsafe { - asm!(".insn r 0x33, 0, 0x38, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) a, options(nomem, nostack)) - }, - 2 => unsafe { - asm!(".insn r 0x33, 0, 0x58, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) a, options(nomem, nostack)) - }, - 3 => unsafe { - asm!(".insn r 0x33, 0, 0x78, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) a, options(nomem, nostack)) - }, - _ => unreachable!(), + unsafe { + asm!("sm4ed {}, {}, {}, {}", lateout(reg) ans, in(reg) x, in(reg) a, const BS, options(pure, nomem, nostack)) }; ans } @@ -739,33 +733,28 @@ pub fn sm4ed<const BS: u8>(x: u32, a: u32) -> u32 { /// Hence, the key schedule operation can be implemented by `sm4ks` instruction like: /// /// ```no_run +/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] +/// # fn key_schedule(k0: u32, k1: u32, k2: u32, k3: u32, ck_i: u32) -> u32 { +/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ks; +/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ks; /// let k = k1 ^ k2 ^ k3 ^ ck_i; /// let c0 = sm4ks::<0>(k0, k); /// let c1 = sm4ks::<1>(c0, k); // c1 represents c[0..=1], etc. /// let c2 = sm4ks::<2>(c1, k); /// let c3 = sm4ks::<3>(c2, k); /// return c3; // c3 represents c[0..=3] +/// # } /// ``` /// /// According to RISC-V Cryptography Extensions, Volume I, the execution latency of /// this instruction must always be independent from the data it operates on. 
+#[inline] +#[target_feature(enable = "zksed")] pub fn sm4ks<const BS: u8>(x: u32, k: u32) -> u32 { static_assert!(BS: u8 where BS <= 3); let ans: u32; - match BS { - 0 => unsafe { - asm!(".insn r 0x33, 0, 0x1A, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) k, options(nomem, nostack)) - }, - 1 => unsafe { - asm!(".insn r 0x33, 0, 0x3A, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) k, options(nomem, nostack)) - }, - 2 => unsafe { - asm!(".insn r 0x33, 0, 0x5A, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) k, options(nomem, nostack)) - }, - 3 => unsafe { - asm!(".insn r 0x33, 0, 0x7A, {}, {}, {}", out(reg) ans, in(reg) x, in(reg) k, options(nomem, nostack)) - }, - _ => unreachable!(), + unsafe { + asm!("sm4ks {}, {}, {}, {}", lateout(reg) ans, in(reg) x, in(reg) k, const BS, options(pure, nomem, nostack)) }; ans } diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/p.rs b/library/stdarch/crates/core_arch/src/riscv_shared/p.rs new file mode 100644 index 000000000..a26044aee --- /dev/null +++ b/library/stdarch/crates/core_arch/src/riscv_shared/p.rs @@ -0,0 +1,1061 @@ +//! RISC-V Packed SIMD intrinsics; shared part. +//! +//! RV64 only part is placed in riscv64 folder. 
+use crate::arch::asm; + +/// Adds packed 16-bit signed numbers, discarding overflow bits +#[inline] +pub fn add16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x20, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the sum of packed 16-bit signed numbers, dropping least bits +#[inline] +pub fn radd16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x00, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the sum of packed 16-bit unsigned numbers, dropping least bits +#[inline] +pub fn uradd16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x10, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Adds packed 16-bit signed numbers, saturating at the numeric bounds +#[inline] +pub fn kadd16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x08, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Adds packed 16-bit unsigned numbers, saturating at the numeric bounds +#[inline] +pub fn ukadd16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x18, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts packed 16-bit signed numbers, discarding overflow bits +#[inline] +pub fn sub16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x21, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the subtraction result of packed 16-bit signed numbers, dropping least bits +#[inline] +pub fn rsub16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 
0x0, 0x01, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the subtraction result of packed 16-bit unsigned numbers, dropping least bits +#[inline] +pub fn ursub16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x11, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts packed 16-bit signed numbers, saturating at the numeric bounds +#[inline] +pub fn ksub16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x09, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts packed 16-bit unsigned numbers, saturating at the numeric bounds +#[inline] +pub fn uksub16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x19, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross adds and subtracts packed 16-bit signed numbers, discarding overflow bits +#[inline] +pub fn cras16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x22, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross halves of adds and subtracts packed 16-bit signed numbers, dropping least bits +#[inline] +pub fn rcras16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x02, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross halves of adds and subtracts packed 16-bit unsigned numbers, dropping least bits +#[inline] +pub fn urcras16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x12, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross adds and 
subtracts packed 16-bit signed numbers, saturating at the numeric bounds +#[inline] +pub fn kcras16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x0A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross adds and subtracts packed 16-bit unsigned numbers, saturating at the numeric bounds +#[inline] +pub fn ukcras16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x1A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross subtracts and adds packed 16-bit signed numbers, discarding overflow bits +#[inline] +pub fn crsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x23, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross halves of subtracts and adds packed 16-bit signed numbers, dropping least bits +#[inline] +pub fn rcrsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x03, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross halves of subtracts and adds packed 16-bit unsigned numbers, dropping least bits +#[inline] +pub fn urcrsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x13, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross subtracts and adds packed 16-bit signed numbers, saturating at the numeric bounds +#[inline] +pub fn kcrsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x0B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Cross subtracts and adds packed 16-bit unsigned numbers, saturating at the numeric bounds +#[inline] +pub fn 
ukcrsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x1B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight adds and subtracts packed 16-bit signed numbers, discarding overflow bits +#[inline] +pub fn stas16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x7A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight halves of adds and subtracts packed 16-bit signed numbers, dropping least bits +#[inline] +pub fn rstas16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x5A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight halves of adds and subtracts packed 16-bit unsigned numbers, dropping least bits +#[inline] +pub fn urstas16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x6A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight adds and subtracts packed 16-bit signed numbers, saturating at the numeric bounds +#[inline] +pub fn kstas16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x62, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight adds and subtracts packed 16-bit unsigned numbers, saturating at the numeric bounds +#[inline] +pub fn ukstas16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x72, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight subtracts and adds packed 16-bit signed numbers, discarding overflow bits +#[inline] +pub fn stsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 
0x77, 0x0, 0x7B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight halves of subtracts and adds packed 16-bit signed numbers, dropping least bits +#[inline] +pub fn rstsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x5B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight halves of subtracts and adds packed 16-bit unsigned numbers, dropping least bits +#[inline] +pub fn urstsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x6B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight subtracts and adds packed 16-bit signed numbers, saturating at the numeric bounds +#[inline] +pub fn kstsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x63, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Straight subtracts and adds packed 16-bit unsigned numbers, saturating at the numeric bounds +#[inline] +pub fn ukstsa16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x73, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Adds packed 8-bit signed numbers, discarding overflow bits +#[inline] +pub fn add8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x24, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the sum of packed 8-bit signed numbers, dropping least bits +#[inline] +pub fn radd8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x04, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves 
the sum of packed 8-bit unsigned numbers, dropping least bits +#[inline] +pub fn uradd8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x14, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Adds packed 8-bit signed numbers, saturating at the numeric bounds +#[inline] +pub fn kadd8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x0C, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Adds packed 8-bit unsigned numbers, saturating at the numeric bounds +#[inline] +pub fn ukadd8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x1C, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts packed 8-bit signed numbers, discarding overflow bits +#[inline] +pub fn sub8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x25, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the subtraction result of packed 8-bit signed numbers, dropping least bits +#[inline] +pub fn rsub8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x05, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Halves the subtraction result of packed 8-bit unsigned numbers, dropping least bits +#[inline] +pub fn ursub8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x15, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts packed 8-bit signed numbers, saturating at the numeric bounds +#[inline] +pub fn ksub8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x0D, {}, {}, 
{}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Subtracts packed 8-bit unsigned numbers, saturating at the numeric bounds +#[inline] +pub fn uksub8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x1D, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Arithmetic right shift packed 16-bit elements without rounding up +#[inline] +pub fn sra16(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x28, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Arithmetic right shift packed 16-bit elements with rounding up +#[inline] +pub fn sra16u(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x30, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical right shift packed 16-bit elements without rounding up +#[inline] +pub fn srl16(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x29, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical right shift packed 16-bit elements with rounding up +#[inline] +pub fn srl16u(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x31, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical left shift packed 16-bit elements, discarding overflow bits +#[inline] +pub fn sll16(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x2A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical left shift packed 16-bit elements, saturating at the numeric bounds +#[inline] +pub fn ksll16(a: usize, b: u32) -> usize { + let value: 
usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x32, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical saturating left then arithmetic right shift packed 16-bit elements +#[inline] +pub fn kslra16(a: usize, b: i32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x2B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical saturating left then arithmetic right shift packed 16-bit elements +#[inline] +pub fn kslra16u(a: usize, b: i32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x33, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Arithmetic right shift packed 8-bit elements without rounding up +#[inline] +pub fn sra8(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x2C, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Arithmetic right shift packed 8-bit elements with rounding up +#[inline] +pub fn sra8u(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x34, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical right shift packed 8-bit elements without rounding up +#[inline] +pub fn srl8(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x2D, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical right shift packed 8-bit elements with rounding up +#[inline] +pub fn srl8u(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x35, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical left shift packed 8-bit elements, discarding overflow bits +#[inline] +pub 
fn sll8(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x2E, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical left shift packed 8-bit elements, saturating at the numeric bounds +#[inline] +pub fn ksll8(a: usize, b: u32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x36, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical saturating left then arithmetic right shift packed 8-bit elements +#[inline] +pub fn kslra8(a: usize, b: i32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x2F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Logical saturating left then arithmetic right shift packed 8-bit elements +#[inline] +pub fn kslra8u(a: usize, b: i32) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x37, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare equality for packed 16-bit elements +#[inline] +pub fn cmpeq16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x26, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 16-bit packed signed integers are less than the others +#[inline] +pub fn scmplt16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x06, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 16-bit packed signed integers are less than or equal to the others +#[inline] +pub fn scmple16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x0E, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + 
+/// Compare whether 16-bit packed unsigned integers are less than the others +#[inline] +pub fn ucmplt16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x16, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 16-bit packed unsigned integers are less than or equal to the others +#[inline] +pub fn ucmple16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x1E, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare equality for packed 8-bit elements +#[inline] +pub fn cmpeq8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x27, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 8-bit packed signed integers are less than the others +#[inline] +pub fn scmplt8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x07, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 8-bit packed signed integers are less than or equal to the others +#[inline] +pub fn scmple8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x0F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 8-bit packed unsigned integers are less than the others +#[inline] +pub fn ucmplt8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x17, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Compare whether 8-bit packed unsigned integers are less than or equal to the others +#[inline] +pub fn ucmple8(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 
0x77, 0x0, 0x1F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Get minimum values from 16-bit packed signed integers +#[inline] +pub fn smin16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x40, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Get minimum values from 16-bit packed unsigned integers +#[inline] +pub fn umin16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x48, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Get maximum values from 16-bit packed signed integers +#[inline] +pub fn smax16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x41, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Get maximum values from 16-bit packed unsigned integers +#[inline] +pub fn umax16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x49, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/* todo: sclip16, uclip16 */ + +/// Compute the absolute value of packed 16-bit signed integers +#[inline] +pub fn kabs16(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xAD1", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Count the number of redundant sign bits of the packed 16-bit elements +#[inline] +pub fn clrs16(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xAE8", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Count the number of leading zero bits of the packed 16-bit elements +#[inline] +pub fn clz16(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 
0x0, {}, {}, 0xAE9", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Swap the 16-bit halfwords within each 32-bit word of a register
+#[inline]
+pub fn swap16(a: usize) -> usize {
+    let value: usize;
+    // this instruction is an alias for `pkbt16 rd, rs1, rs1`, so it must use pkbt16's encoding (funct3 = 0x1, funct7 = 0x0F); funct3 = 0x0 would encode `scmple8`.
+    unsafe {
+        asm!(".insn r 0x77, 0x1, 0x0F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Get minimum values from 8-bit packed signed integers
+#[inline]
+pub fn smin8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x44, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Get minimum values from 8-bit packed unsigned integers
+#[inline]
+pub fn umin8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x4C, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Get maximum values from 8-bit packed signed integers
+#[inline]
+pub fn smax8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x45, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Get maximum values from 8-bit packed unsigned integers
+#[inline]
+pub fn umax8(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x4D, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/* todo: sclip8, uclip8 */
+
+/// Compute the absolute value of packed 8-bit signed integers
+#[inline]
+pub fn kabs8(a: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn i 0x77, 0x0, {}, {}, 0xAD0", lateout(reg) value, in(reg) a, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Count the number of redundant sign bits of the packed 8-bit elements
+#[inline]
+pub fn clrs8(a: usize) -> usize {
+    let value: 
usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xAE0", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Count the number of leading zero bits of the packed 8-bit elements +#[inline] +pub fn clz8(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xAE1", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Swap the 8-bit bytes within each 16-bit halfword of a register. +#[inline] +pub fn swap8(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xAD8", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack first and zeroth into two 16-bit signed halfwords in each 32-bit chunk +#[inline] +pub fn sunpkd810(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xAC8", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack second and zeroth into two 16-bit signed halfwords in each 32-bit chunk +#[inline] +pub fn sunpkd820(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xAC9", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack third and zeroth into two 16-bit signed halfwords in each 32-bit chunk +#[inline] +pub fn sunpkd830(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xACA", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack third and first into two 16-bit signed halfwords in each 32-bit chunk +#[inline] +pub fn sunpkd831(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xACB", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack third and second into two 16-bit signed halfwords in each 32-bit chunk +#[inline] +pub fn sunpkd832(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn 
i 0x77, 0x0, {}, {}, 0xAD3", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack first and zeroth into two 16-bit unsigned halfwords in each 32-bit chunk +#[inline] +pub fn zunpkd810(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xACC", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack second and zeroth into two 16-bit unsigned halfwords in each 32-bit chunk +#[inline] +pub fn zunpkd820(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xACD", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack third and zeroth into two 16-bit unsigned halfwords in each 32-bit chunk +#[inline] +pub fn zunpkd830(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xACE", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack third and first into two 16-bit unsigned halfwords in each 32-bit chunk +#[inline] +pub fn zunpkd831(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xACF", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Unpack third and second into two 16-bit unsigned halfwords in each 32-bit chunk +#[inline] +pub fn zunpkd832(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xAD7", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +// todo: pkbb16, pktt16 + +/// Pack two 16-bit data from bottom and top half from 32-bit chunks +#[inline] +pub fn pkbt16(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x1, 0x0F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Pack two 16-bit data from top and bottom half from 32-bit chunks +#[inline] +pub fn pktb16(a: usize, b: usize) -> usize { + 
let value: usize; + unsafe { + asm!(".insn r 0x77, 0x1, 0x1F, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Count the number of redundant sign bits of the packed 32-bit elements +#[inline] +pub fn clrs32(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xAF8", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Count the number of leading zero bits of the packed 32-bit elements +#[inline] +pub fn clz32(a: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn i 0x77, 0x0, {}, {}, 0xAF9", lateout(reg) value, in(reg) a, options(pure, nomem, nostack)) + } + value +} + +/// Calculate the sum of absolute difference of unsigned 8-bit data elements +#[inline] +pub fn pbsad(a: usize, b: usize) -> usize { + let value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x7E, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Calculate and accumulate the sum of absolute difference of unsigned 8-bit data elements +#[inline] +pub fn pbsada(t: usize, a: usize, b: usize) -> usize { + let mut value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x7F, {}, {}, {}", inlateout(reg) t => value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Multiply signed 8-bit elements and add 16-bit elements on results for packed 32-bit chunks +#[inline] +pub fn smaqa(t: usize, a: usize, b: usize) -> usize { + let mut value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x64, {}, {}, {}", inlateout(reg) t => value, in(reg) a, in(reg) b, options(pure, nomem, nostack)) + } + value +} + +/// Multiply unsigned 8-bit elements and add 16-bit elements on results for packed 32-bit chunks +#[inline] +pub fn umaqa(t: usize, a: usize, b: usize) -> usize { + let mut value: usize; + unsafe { + asm!(".insn r 0x77, 0x0, 0x66, {}, {}, {}", inlateout(reg) t => value, in(reg) a, in(reg) b, options(pure, 
nomem, nostack))
+    }
+    value
+}
+
+/// Multiply signed to unsigned 8-bit and add 16-bit elements on results for packed 32-bit chunks
+#[inline]
+pub fn smaqasu(t: usize, a: usize, b: usize) -> usize {
+    let mut value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x0, 0x65, {}, {}, {}", inlateout(reg) t => value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Adds signed lower 16-bit content of two registers with Q15 saturation
+#[inline]
+pub fn kaddh(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x1, 0x02, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Subtracts signed lower 16-bit content of two registers with Q15 saturation
+#[inline]
+pub fn ksubh(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x1, 0x03, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Adds unsigned lower 16-bit content of two registers with U16 saturation
+#[inline]
+pub fn ukaddh(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x1, 0x0A, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
+
+/// Subtracts unsigned lower 16-bit content of two registers with U16 saturation
+#[inline]
+pub fn uksubh(a: usize, b: usize) -> usize {
+    let value: usize;
+    unsafe {
+        asm!(".insn r 0x77, 0x1, 0x0B, {}, {}, {}", lateout(reg) value, in(reg) a, in(reg) b, options(pure, nomem, nostack))
+    }
+    value
+}
diff --git a/library/stdarch/crates/core_arch/src/simd_llvm.rs b/library/stdarch/crates/core_arch/src/simd_llvm.rs
index 1970e5c69..decdecaaf 100644
--- a/library/stdarch/crates/core_arch/src/simd_llvm.rs
+++ b/library/stdarch/crates/core_arch/src/simd_llvm.rs
@@ -9,13 +9,7 @@ extern "platform-intrinsic" {
     pub fn simd_gt<T, U>(x: T, y: T) -> U;
     pub fn simd_ge<T, U>(x: T, y: T) -> U;
 
-    pub fn simd_shuffle2<T, U>(x: 
T, y: T, idx: [u32; 2]) -> U; - pub fn simd_shuffle4<T, U>(x: T, y: T, idx: [u32; 4]) -> U; - pub fn simd_shuffle8<T, U>(x: T, y: T, idx: [u32; 8]) -> U; - pub fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U; - pub fn simd_shuffle32<T, U>(x: T, y: T, idx: [u32; 32]) -> U; - pub fn simd_shuffle64<T, U>(x: T, y: T, idx: [u32; 64]) -> U; - pub fn simd_shuffle128<T, U>(x: T, y: T, idx: [u32; 128]) -> U; + pub fn simd_shuffle<T, U, V>(x: T, y: T, idx: U) -> V; #[rustc_const_unstable(feature = "const_simd_insert", issue = "none")] pub fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T; diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs index 24f9c0301..16add3dbb 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -2001,7 +2001,7 @@ pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmovmskb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 { - pmovmskb(a.as_i8x32()) + simd_bitmask::<_, u32>(a.as_i8x32()) as i32 } /// Computes the sum of absolute differences (SADs) of quadruplets of unsigned @@ -3642,8 +3642,6 @@ extern "C" { fn pminud(a: u32x8, b: u32x8) -> u32x8; #[link_name = "llvm.x86.avx2.pminu.b"] fn pminub(a: u8x32, b: u8x32) -> u8x32; - #[link_name = "llvm.x86.avx2.pmovmskb"] - fn pmovmskb(a: i8x32) -> i32; #[link_name = "llvm.x86.avx2.mpsadbw"] fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16; #[link_name = "llvm.x86.avx2.pmulhu.w"] diff --git a/library/stdarch/crates/core_arch/src/x86/cpuid.rs b/library/stdarch/crates/core_arch/src/x86/cpuid.rs index 6b90295ef..2624e8bdf 100644 --- a/library/stdarch/crates/core_arch/src/x86/cpuid.rs +++ b/library/stdarch/crates/core_arch/src/x86/cpuid.rs @@ -62,27 +62,27 @@ pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult { #[cfg(target_arch = "x86")] { asm!( - "movl 
%ebx, {0}", + "mov {0}, ebx", "cpuid", - "xchgl %ebx, {0}", - lateout(reg) ebx, - inlateout("eax") leaf => eax, - inlateout("ecx") sub_leaf => ecx, - lateout("edx") edx, - options(nostack, preserves_flags, att_syntax), + "xchg {0}, ebx", + out(reg) ebx, + inout("eax") leaf => eax, + inout("ecx") sub_leaf => ecx, + out("edx") edx, + options(nostack, preserves_flags), ); } #[cfg(target_arch = "x86_64")] { asm!( - "movq %rbx, {0:r}", + "mov {0:r}, rbx", "cpuid", - "xchgq %rbx, {0:r}", - lateout(reg) ebx, - inlateout("eax") leaf => eax, - inlateout("ecx") sub_leaf => ecx, - lateout("edx") edx, - options(nostack, preserves_flags, att_syntax), + "xchg {0:r}, rbx", + out(reg) ebx, + inout("eax") leaf => eax, + inout("ecx") sub_leaf => ecx, + out("edx") edx, + options(nostack, preserves_flags), ); } CpuidResult { eax, ebx, ecx, edx } diff --git a/library/stdarch/crates/core_arch/src/x86/mod.rs b/library/stdarch/crates/core_arch/src/x86/mod.rs index 547bfe67d..6b50e95b2 100644 --- a/library/stdarch/crates/core_arch/src/x86/mod.rs +++ b/library/stdarch/crates/core_arch/src/x86/mod.rs @@ -306,7 +306,7 @@ types! { /// 256-bit wide set of 16 'u16' types, x86-specific /// - /// This type is the same as the `__m128bh` type defined by Intel, + /// This type is the same as the `__m256bh` type defined by Intel, /// representing a 256-bit SIMD register which internally is consisted of /// 16 packed `u16` instances. Its purpose is for bf16 related intrinsic /// implementations. @@ -317,7 +317,7 @@ types! { /// 512-bit wide set of 32 'u16' types, x86-specific /// - /// This type is the same as the `__m128bh` type defined by Intel, + /// This type is the same as the `__m512bh` type defined by Intel, /// representing a 512-bit SIMD register which internally is consisted of /// 32 packed `u16` instances. Its purpose is for bf16 related intrinsic /// implementations. 
diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs index d82b8641f..3e79b3539 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse2.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs @@ -1378,7 +1378,7 @@ pub unsafe fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i { #[cfg_attr(test, assert_instr(pmovmskb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 { - pmovmskb(a.as_i8x16()) + simd_bitmask::<_, u16>(a.as_i8x16()) as u32 as i32 } /// Shuffles 32-bit integers in `a` using the control in `IMM8`. @@ -2856,8 +2856,6 @@ extern "C" { fn packssdw(a: i32x4, b: i32x4) -> i16x8; #[link_name = "llvm.x86.sse2.packuswb.128"] fn packuswb(a: i16x8, b: i16x8) -> u8x16; - #[link_name = "llvm.x86.sse2.pmovmskb.128"] - fn pmovmskb(a: i8x16) -> i32; #[link_name = "llvm.x86.sse2.max.sd"] fn maxsd(a: __m128d, b: __m128d) -> __m128d; #[link_name = "llvm.x86.sse2.max.pd"] diff --git a/library/stdarch/crates/core_arch/src/x86/sse3.rs b/library/stdarch/crates/core_arch/src/x86/sse3.rs index ab0dd38fe..61f8a4e78 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse3.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse3.rs @@ -1,11 +1,7 @@ //! 
Streaming SIMD Extensions 3 (SSE3) use crate::{ - core_arch::{ - simd::*, - simd_llvm::{simd_shuffle2, simd_shuffle4}, - x86::*, - }, + core_arch::{simd::*, simd_llvm::simd_shuffle, x86::*}, mem::transmute, }; diff --git a/library/stdarch/crates/std_detect/Cargo.toml b/library/stdarch/crates/std_detect/Cargo.toml index 1ca0d9c5d..3a482564e 100644 --- a/library/stdarch/crates/std_detect/Cargo.toml +++ b/library/stdarch/crates/std_detect/Cargo.toml @@ -22,7 +22,7 @@ maintenance = { status = "experimental" } [dependencies] libc = { version = "0.2", optional = true, default-features = false } -cfg-if = "0.1.10" +cfg-if = "1.0.0" # When built as part of libstd core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" } diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs index b6a2e5218..6c79ba86d 100644 --- a/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs +++ b/library/stdarch/crates/std_detect/src/detect/os/linux/aarch64.rs @@ -23,58 +23,62 @@ pub(crate) fn detect_features() -> cache::Initializer { /// The names match those used for cpuinfo. 
/// /// [hwcap]: https://github.com/torvalds/linux/blob/master/arch/arm64/include/uapi/asm/hwcap.h +#[derive(Debug, Default, PartialEq)] struct AtHwcap { - fp: bool, // 0 - asimd: bool, // 1 - // evtstrm: bool, // 2 No LLVM support - aes: bool, // 3 - pmull: bool, // 4 - sha1: bool, // 5 - sha2: bool, // 6 - crc32: bool, // 7 - atomics: bool, // 8 - fphp: bool, // 9 - asimdhp: bool, // 10 - // cpuid: bool, // 11 No LLVM support - asimdrdm: bool, // 12 - jscvt: bool, // 13 - fcma: bool, // 14 - lrcpc: bool, // 15 - dcpop: bool, // 16 - sha3: bool, // 17 - sm3: bool, // 18 - sm4: bool, // 19 - asimddp: bool, // 20 - sha512: bool, // 21 - sve: bool, // 22 - fhm: bool, // 23 - dit: bool, // 24 - uscat: bool, // 25 - ilrcpc: bool, // 26 - flagm: bool, // 27 - ssbs: bool, // 28 - sb: bool, // 29 - paca: bool, // 30 - pacg: bool, // 31 - dcpodp: bool, // 32 - sve2: bool, // 33 - sveaes: bool, // 34 - // svepmull: bool, // 35 No LLVM support - svebitperm: bool, // 36 - svesha3: bool, // 37 - svesm4: bool, // 38 - // flagm2: bool, // 39 No LLVM support - frint: bool, // 40 - // svei8mm: bool, // 41 See i8mm feature - svef32mm: bool, // 42 - svef64mm: bool, // 43 - // svebf16: bool, // 44 See bf16 feature - i8mm: bool, // 45 - bf16: bool, // 46 - // dgh: bool, // 47 No LLVM support - rng: bool, // 48 - bti: bool, // 49 - mte: bool, // 50 + // AT_HWCAP + fp: bool, + asimd: bool, + // evtstrm: No LLVM support. + aes: bool, + pmull: bool, + sha1: bool, + sha2: bool, + crc32: bool, + atomics: bool, + fphp: bool, + asimdhp: bool, + // cpuid: No LLVM support. + asimdrdm: bool, + jscvt: bool, + fcma: bool, + lrcpc: bool, + dcpop: bool, + sha3: bool, + sm3: bool, + sm4: bool, + asimddp: bool, + sha512: bool, + sve: bool, + fhm: bool, + dit: bool, + uscat: bool, + ilrcpc: bool, + flagm: bool, + ssbs: bool, + sb: bool, + paca: bool, + pacg: bool, + + // AT_HWCAP2 + dcpodp: bool, + sve2: bool, + sveaes: bool, + // svepmull: No LLVM support. 
+ svebitperm: bool, + svesha3: bool, + svesm4: bool, + // flagm2: No LLVM support. + frint: bool, + // svei8mm: See i8mm feature. + svef32mm: bool, + svef64mm: bool, + // svebf16: See bf16 feature. + i8mm: bool, + bf16: bool, + // dgh: No LLVM support. + rng: bool, + bti: bool, + mte: bool, } impl From<auxvec::AuxVec> for AtHwcap { @@ -113,25 +117,25 @@ impl From<auxvec::AuxVec> for AtHwcap { sb: bit::test(auxv.hwcap, 29), paca: bit::test(auxv.hwcap, 30), pacg: bit::test(auxv.hwcap, 31), - dcpodp: bit::test(auxv.hwcap, 32), - sve2: bit::test(auxv.hwcap, 33), - sveaes: bit::test(auxv.hwcap, 34), - // svepmull: bit::test(auxv.hwcap, 35), - svebitperm: bit::test(auxv.hwcap, 36), - svesha3: bit::test(auxv.hwcap, 37), - svesm4: bit::test(auxv.hwcap, 38), - // flagm2: bit::test(auxv.hwcap, 39), - frint: bit::test(auxv.hwcap, 40), - // svei8mm: bit::test(auxv.hwcap, 41), - svef32mm: bit::test(auxv.hwcap, 42), - svef64mm: bit::test(auxv.hwcap, 43), - // svebf16: bit::test(auxv.hwcap, 44), - i8mm: bit::test(auxv.hwcap, 45), - bf16: bit::test(auxv.hwcap, 46), - // dgh: bit::test(auxv.hwcap, 47), - rng: bit::test(auxv.hwcap, 48), - bti: bit::test(auxv.hwcap, 49), - mte: bit::test(auxv.hwcap, 50), + dcpodp: bit::test(auxv.hwcap2, 0), + sve2: bit::test(auxv.hwcap2, 1), + sveaes: bit::test(auxv.hwcap2, 2), + // svepmull: bit::test(auxv.hwcap2, 3), + svebitperm: bit::test(auxv.hwcap2, 4), + svesha3: bit::test(auxv.hwcap2, 5), + svesm4: bit::test(auxv.hwcap2, 6), + // flagm2: bit::test(auxv.hwcap2, 7), + frint: bit::test(auxv.hwcap2, 8), + // svei8mm: bit::test(auxv.hwcap2, 9), + svef32mm: bit::test(auxv.hwcap2, 10), + svef64mm: bit::test(auxv.hwcap2, 11), + // svebf16: bit::test(auxv.hwcap2, 12), + i8mm: bit::test(auxv.hwcap2, 13), + bf16: bit::test(auxv.hwcap2, 14), + // dgh: bit::test(auxv.hwcap2, 15), + rng: bit::test(auxv.hwcap2, 16), + bti: bit::test(auxv.hwcap2, 17), + mte: bit::test(auxv.hwcap2, 18), } } } @@ -288,3 +292,86 @@ impl AtHwcap { value } } + +#[cfg(test)] +mod 
tests { + use super::*; + + #[cfg(feature = "std_detect_file_io")] + mod auxv_from_file { + use super::auxvec::auxv_from_file; + use super::*; + // The baseline hwcaps used in the (artificial) auxv test files. + fn baseline_hwcaps() -> AtHwcap { + AtHwcap { + fp: true, + asimd: true, + aes: true, + pmull: true, + sha1: true, + sha2: true, + crc32: true, + atomics: true, + fphp: true, + asimdhp: true, + asimdrdm: true, + lrcpc: true, + dcpop: true, + asimddp: true, + ssbs: true, + ..AtHwcap::default() + } + } + + #[test] + fn linux_empty_hwcap2_aarch64() { + let file = concat!( + env!("CARGO_MANIFEST_DIR"), + "/src/detect/test_data/linux-empty-hwcap2-aarch64.auxv" + ); + println!("file: {}", file); + let v = auxv_from_file(file).unwrap(); + println!("HWCAP : 0x{:0x}", v.hwcap); + println!("HWCAP2: 0x{:0x}", v.hwcap2); + assert_eq!(AtHwcap::from(v), baseline_hwcaps()); + } + #[test] + fn linux_no_hwcap2_aarch64() { + let file = concat!( + env!("CARGO_MANIFEST_DIR"), + "/src/detect/test_data/linux-no-hwcap2-aarch64.auxv" + ); + println!("file: {}", file); + let v = auxv_from_file(file).unwrap(); + println!("HWCAP : 0x{:0x}", v.hwcap); + println!("HWCAP2: 0x{:0x}", v.hwcap2); + assert_eq!(AtHwcap::from(v), baseline_hwcaps()); + } + #[test] + fn linux_hwcap2_aarch64() { + let file = concat!( + env!("CARGO_MANIFEST_DIR"), + "/src/detect/test_data/linux-hwcap2-aarch64.auxv" + ); + println!("file: {}", file); + let v = auxv_from_file(file).unwrap(); + println!("HWCAP : 0x{:0x}", v.hwcap); + println!("HWCAP2: 0x{:0x}", v.hwcap2); + assert_eq!( + AtHwcap::from(v), + AtHwcap { + // Some other HWCAP bits. + paca: true, + pacg: true, + // HWCAP2-only bits. 
+ dcpodp: true, + frint: true, + rng: true, + bti: true, + mte: true, + ..baseline_hwcaps() + } + ); + } + } +} diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs index e6447d0cd..c903903bd 100644 --- a/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs +++ b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs @@ -7,6 +7,7 @@ pub(crate) const AT_NULL: usize = 0; pub(crate) const AT_HWCAP: usize = 16; /// Key to access the CPU Hardware capabilities 2 bitfield. #[cfg(any( + target_arch = "aarch64", target_arch = "arm", target_arch = "powerpc", target_arch = "powerpc64" @@ -21,6 +22,7 @@ pub(crate) const AT_HWCAP2: usize = 26; pub(crate) struct AuxVec { pub hwcap: usize, #[cfg(any( + target_arch = "aarch64", target_arch = "arm", target_arch = "powerpc", target_arch = "powerpc64" @@ -64,13 +66,14 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> { if let Ok(hwcap) = getauxval(AT_HWCAP) { // Targets with only AT_HWCAP: #[cfg(any( - target_arch = "aarch64", target_arch = "riscv32", target_arch = "riscv64", target_arch = "mips", target_arch = "mips64" ))] { + // Zero could indicate that no features were detected, but it's also used to + // indicate an error. In either case, try the fallback. if hwcap != 0 { return Ok(AuxVec { hwcap }); } @@ -78,13 +81,18 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> { // Targets with AT_HWCAP and AT_HWCAP2: #[cfg(any( + target_arch = "aarch64", target_arch = "arm", target_arch = "powerpc", target_arch = "powerpc64" ))] { if let Ok(hwcap2) = getauxval(AT_HWCAP2) { - if hwcap != 0 && hwcap2 != 0 { + // Zero could indicate that no features were detected, but it's also used to + // indicate an error. In particular, on many platforms AT_HWCAP2 will be + // legitimately zero, since it contains the most recent feature flags. Use the + // fallback only if no features were detected at all. 
+ if hwcap != 0 || hwcap2 != 0 { return Ok(AuxVec { hwcap, hwcap2 }); } } @@ -97,7 +105,6 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> { { // Targets with only AT_HWCAP: #[cfg(any( - target_arch = "aarch64", target_arch = "riscv32", target_arch = "riscv64", target_arch = "mips", @@ -105,6 +112,8 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> { ))] { let hwcap = unsafe { libc::getauxval(AT_HWCAP as libc::c_ulong) as usize }; + // Zero could indicate that no features were detected, but it's also used to indicate + // an error. In either case, try the fallback. if hwcap != 0 { return Ok(AuxVec { hwcap }); } @@ -112,6 +121,7 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> { // Targets with AT_HWCAP and AT_HWCAP2: #[cfg(any( + target_arch = "aarch64", target_arch = "arm", target_arch = "powerpc", target_arch = "powerpc64" @@ -119,7 +129,11 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> { { let hwcap = unsafe { libc::getauxval(AT_HWCAP as libc::c_ulong) as usize }; let hwcap2 = unsafe { libc::getauxval(AT_HWCAP2 as libc::c_ulong) as usize }; - if hwcap != 0 && hwcap2 != 0 { + // Zero could indicate that no features were detected, but it's also used to indicate + // an error. In particular, on many platforms AT_HWCAP2 will be legitimately zero, + // since it contains the most recent feature flags. Use the fallback only if no + // features were detected at all. + if hwcap != 0 || hwcap2 != 0 { return Ok(AuxVec { hwcap, hwcap2 }); } } @@ -158,7 +172,7 @@ fn getauxval(key: usize) -> Result<usize, ()> { /// Tries to read the auxiliary vector from the `file`. If this fails, this /// function returns `Err`. #[cfg(feature = "std_detect_file_io")] -fn auxv_from_file(file: &str) -> Result<AuxVec, ()> { +pub(super) fn auxv_from_file(file: &str) -> Result<AuxVec, ()> { let file = super::read_file(file)?; // See <https://github.com/torvalds/linux/blob/v3.19/include/uapi/linux/auxvec.h>. 
@@ -181,7 +195,6 @@ fn auxv_from_file(file: &str) -> Result<AuxVec, ()> { fn auxv_from_buf(buf: &[usize; 64]) -> Result<AuxVec, ()> { // Targets with only AT_HWCAP: #[cfg(any( - target_arch = "aarch64", target_arch = "riscv32", target_arch = "riscv64", target_arch = "mips", @@ -198,23 +211,25 @@ fn auxv_from_buf(buf: &[usize; 64]) -> Result<AuxVec, ()> { } // Targets with AT_HWCAP and AT_HWCAP2: #[cfg(any( + target_arch = "aarch64", target_arch = "arm", target_arch = "powerpc", target_arch = "powerpc64" ))] { let mut hwcap = None; - let mut hwcap2 = None; + // For some platforms, AT_HWCAP2 was added recently, so let it default to zero. + let mut hwcap2 = 0; for el in buf.chunks(2) { match el[0] { AT_NULL => break, AT_HWCAP => hwcap = Some(el[1]), - AT_HWCAP2 => hwcap2 = Some(el[1]), + AT_HWCAP2 => hwcap2 = el[1], _ => (), } } - if let (Some(hwcap), Some(hwcap2)) = (hwcap, hwcap2) { + if let Some(hwcap) = hwcap { return Ok(AuxVec { hwcap, hwcap2 }); } } @@ -256,7 +271,6 @@ mod tests { // FIXME: on mips/mips64 getauxval returns 0, and /proc/self/auxv // does not always contain the AT_HWCAP key under qemu. #[cfg(any( - target_arch = "aarch64", target_arch = "arm", target_arch = "powerpc", target_arch = "powerpc64" @@ -271,6 +285,7 @@ mod tests { // Targets with AT_HWCAP and AT_HWCAP2: #[cfg(any( + target_arch = "aarch64", target_arch = "arm", target_arch = "powerpc", target_arch = "powerpc64" @@ -305,24 +320,31 @@ mod tests { } #[test] - #[should_panic] fn linux_macos_vb() { let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/macos-virtualbox-linux-x86-4850HQ.auxv"); println!("file: {}", file); + // The file contains HWCAP but not HWCAP2. In that case, we treat HWCAP2 as zero. let v = auxv_from_file(file).unwrap(); - // this file is incomplete (contains hwcap but not hwcap2), we - // want to fall back to /proc/cpuinfo in this case, so - // reading should fail. 
assert_eq!(v.hwcap, 126614527); - // assert_eq!(v.hwcap2, 0); - let _ = v; + assert_eq!(v.hwcap, 126614527); + assert_eq!(v.hwcap2, 0); } } else if #[cfg(target_arch = "aarch64")] { #[test] - fn linux_x64() { - let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-x64-i7-6850k.auxv"); + fn linux_artificial_aarch64() { + let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-artificial-aarch64.auxv"); println!("file: {}", file); let v = auxv_from_file(file).unwrap(); - assert_eq!(v.hwcap, 3219913727); + assert_eq!(v.hwcap, 0x0123456789abcdef); + assert_eq!(v.hwcap2, 0x02468ace13579bdf); + } + #[test] + fn linux_no_hwcap2_aarch64() { + let file = concat!(env!("CARGO_MANIFEST_DIR"), "/src/detect/test_data/linux-no-hwcap2-aarch64.auxv"); + println!("file: {}", file); + let v = auxv_from_file(file).unwrap(); + // An absent HWCAP2 is treated as zero, and does not prevent acceptance of HWCAP. + assert_ne!(v.hwcap, 0); + assert_eq!(v.hwcap2, 0); } } } @@ -353,6 +375,7 @@ mod tests { // Targets with AT_HWCAP and AT_HWCAP2: #[cfg(any( + target_arch = "aarch64", target_arch = "arm", target_arch = "powerpc", target_arch = "powerpc64" diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-artificial-aarch64.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-artificial-aarch64.auxv Binary files differnew file mode 100644 index 000000000..ec826afcf --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/test_data/linux-artificial-aarch64.auxv diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-empty-hwcap2-aarch64.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-empty-hwcap2-aarch64.auxv Binary files differnew file mode 100644 index 000000000..95537b73f --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/test_data/linux-empty-hwcap2-aarch64.auxv diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-hwcap2-aarch64.auxv 
b/library/stdarch/crates/std_detect/src/detect/test_data/linux-hwcap2-aarch64.auxv Binary files differnew file mode 100644 index 000000000..1d87264b2 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/test_data/linux-hwcap2-aarch64.auxv diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-no-hwcap2-aarch64.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-no-hwcap2-aarch64.auxv Binary files differnew file mode 100644 index 000000000..35f01cc76 --- /dev/null +++ b/library/stdarch/crates/std_detect/src/detect/test_data/linux-no-hwcap2-aarch64.auxv diff --git a/library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxv b/library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxv Binary files differdeleted file mode 100644 index 6afe1b3b4..000000000 --- a/library/stdarch/crates/std_detect/src/detect/test_data/linux-x64-i7-6850k.auxv +++ /dev/null diff --git a/library/stdarch/crates/stdarch-test/Cargo.toml b/library/stdarch/crates/stdarch-test/Cargo.toml index 9ac1057be..012b4e959 100644 --- a/library/stdarch/crates/stdarch-test/Cargo.toml +++ b/library/stdarch/crates/stdarch-test/Cargo.toml @@ -10,7 +10,7 @@ simd-test-macro = { path = "../simd-test-macro" } cc = "1.0" lazy_static = "1.0" rustc-demangle = "0.1.8" -cfg-if = "0.1" +cfg-if = "1.0" # We use a crates.io dependency to disassemble wasm binaries to look for # instructions for `#[assert_instr]`. Note that we use an `=` dependency here diff --git a/library/stdarch/crates/stdarch-test/src/lib.rs b/library/stdarch/crates/stdarch-test/src/lib.rs index 078736c66..eba17771c 100644 --- a/library/stdarch/crates/stdarch-test/src/lib.rs +++ b/library/stdarch/crates/stdarch-test/src/lib.rs @@ -3,7 +3,6 @@ //! This basically just disassembles the current executable and then parses the //! output once globally and then provides the `assert` function which makes //! assertions about the disassembly of a function. 
-#![feature(bench_black_box)] // For black_box #![deny(rust_2018_idioms)] #![allow(clippy::missing_docs_in_private_items, clippy::print_stdout)] |