From ef24de24a82fe681581cc130f342363c47c0969a Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 7 Jun 2024 07:48:48 +0200 Subject: Merging upstream version 1.75.0+dfsg1. Signed-off-by: Daniel Baumann --- .../stdarch/crates/assert-instr-macro/Cargo.toml | 2 +- .../stdarch/crates/assert-instr-macro/src/lib.rs | 13 +- .../core_arch/src/arm_shared/barrier/cp15.rs | 9 +- .../src/arm_shared/neon/shift_and_insert_tests.rs | 4 +- library/stdarch/crates/core_arch/src/riscv64/zk.rs | 57 ++-- .../crates/core_arch/src/riscv_shared/zb.rs | 12 +- .../crates/core_arch/src/riscv_shared/zk.rs | 30 +- library/stdarch/crates/core_arch/src/x86/avx.rs | 40 +-- library/stdarch/crates/core_arch/src/x86/avx2.rs | 46 +-- .../crates/core_arch/src/x86/avx512bitalg.rs | 12 +- .../stdarch/crates/core_arch/src/x86/avx512bw.rs | 104 +++---- .../stdarch/crates/core_arch/src/x86/avx512f.rs | 333 ++++++++------------- library/stdarch/crates/core_arch/src/x86/sse.rs | 186 +++++++++--- library/stdarch/crates/core_arch/src/x86/sse2.rs | 309 ++++++++++++------- library/stdarch/crates/core_arch/src/x86/sse3.rs | 18 +- library/stdarch/crates/core_arch/src/x86/sse41.rs | 72 +++-- library/stdarch/crates/core_arch/src/x86/test.rs | 9 + .../stdarch/crates/core_arch/src/x86_64/avx512f.rs | 60 ++-- .../stdarch/crates/core_arch/src/x86_64/sse2.rs | 3 + library/stdarch/crates/intrinsic-test/Cargo.toml | 6 +- library/stdarch/crates/intrinsic-test/README.md | 4 +- .../crates/intrinsic-test/src/json_parser.rs | 3 +- library/stdarch/crates/intrinsic-test/src/main.rs | 93 +++--- library/stdarch/crates/simd-test-macro/Cargo.toml | 1 + library/stdarch/crates/simd-test-macro/src/lib.rs | 39 +-- library/stdarch/crates/std_detect/Cargo.toml | 1 - .../std_detect/src/detect/os/linux/auxvec.rs | 59 +--- .../stdarch/crates/std_detect/src/detect/os/x86.rs | 6 +- library/stdarch/crates/stdarch-test/Cargo.toml | 2 +- .../stdarch/crates/stdarch-test/src/disassembly.rs | 2 + library/stdarch/crates/stdarch-verify/Cargo.toml | 4 +- library/stdarch/crates/stdarch-verify/src/lib.rs | 82 +++-- 32 files changed, 843 insertions(+), 778 deletions(-) (limited to 'library/stdarch/crates') diff --git a/library/stdarch/crates/assert-instr-macro/Cargo.toml b/library/stdarch/crates/assert-instr-macro/Cargo.toml index 4ad654e69..881c8109c 100644 --- a/library/stdarch/crates/assert-instr-macro/Cargo.toml +++ b/library/stdarch/crates/assert-instr-macro/Cargo.toml @@ -11,4 +11,4 @@ test = false [dependencies] proc-macro2 = "1.0" quote = "1.0" -syn = { version = "1.0", features = ["full"] } +syn = { version = "2.0", features = ["full"] } diff --git a/library/stdarch/crates/assert-instr-macro/src/lib.rs b/library/stdarch/crates/assert-instr-macro/src/lib.rs index 99e37c910..c9de43943 100644 --- a/library/stdarch/crates/assert-instr-macro/src/lib.rs +++ b/library/stdarch/crates/assert-instr-macro/src/lib.rs @@ -35,6 +35,15 @@ pub fn assert_instr( let instr = &invoc.instr; let name = &func.sig.ident; + let maybe_allow_deprecated = if func + .attrs + .iter() + .any(|attr| attr.path().is_ident("deprecated")) + { + quote! { #[allow(deprecated)] } + } else { + quote! {} + }; // Disable assert_instr for x86 targets compiled with avx enabled, which // causes LLVM to generate different intrinsics that the ones we are @@ -108,7 +117,7 @@ pub fn assert_instr( .attrs .iter() .filter(|attr| { - attr.path + attr.path() .segments .first() .expect("attr.path.segments.first() failed") @@ -135,6 +144,7 @@ pub fn assert_instr( let to_test = if disable_dedup_guard { quote! 
{ #attrs + #maybe_allow_deprecated #[no_mangle] #[inline(never)] pub unsafe extern #abi fn #shim_name(#(#inputs),*) #ret { @@ -147,6 +157,7 @@ pub fn assert_instr( const #shim_name_ptr : *const u8 = #shim_name_str.as_ptr(); #attrs + #maybe_allow_deprecated #[no_mangle] #[inline(never)] pub unsafe extern #abi fn #shim_name(#(#inputs),*) #ret { diff --git a/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs b/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs index 6faae0fee..fe540a7d8 100644 --- a/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs +++ b/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs @@ -11,7 +11,8 @@ impl super::super::sealed::Dmb for SY { #[inline(always)] unsafe fn __dmb(&self) { asm!( - "mcr p15, 0, r0, c7, c10, 5", + "mcr p15, 0, {}, c7, c10, 5", + in(reg) 0_u32, options(preserves_flags, nostack) ) } @@ -21,7 +22,8 @@ impl super::super::sealed::Dsb for SY { #[inline(always)] unsafe fn __dsb(&self) { asm!( - "mcr p15, 0, r0, c7, c10, 4", + "mcr p15, 0, {}, c7, c10, 4", + in(reg) 0_u32, options(preserves_flags, nostack) ) } @@ -31,7 +33,8 @@ impl super::super::sealed::Isb for SY { #[inline(always)] unsafe fn __isb(&self) { asm!( - "mcr p15, 0, r0, c7, c5, 4", + "mcr p15, 0, {}, c7, c5, 4", + in(reg) 0_u32, options(preserves_flags, nostack) ) } diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs index ebb8b7b9e..54bffa450 100644 --- a/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs @@ -22,7 +22,7 @@ macro_rules! test_vsli { let a = [$($a as $t),*]; let b = [$($b as $t),*]; let n_bit_mask: $t = (1 << $n) - 1; - let e = [$(($a as $t & n_bit_mask) | ($b as $t << $n)),*]; + let e = [$(($a as $t & n_bit_mask) | (($b as $t) << $n)),*]; let r = $fn_id::<$n>(transmute(a), transmute(b)); let mut d = e; d = transmute(r); @@ -60,7 +60,7 @@ macro_rules! test_vsri { unsafe fn $test_id() { let a = [$($a as $t),*]; let b = [$($b as $t),*]; - let n_bit_mask = ((1 as $t << $n) - 1).rotate_right($n); + let n_bit_mask = (((1 as $t) << $n) - 1).rotate_right($n); let e = [$(($a as $t & n_bit_mask) | (($b as $t >> $n) & !n_bit_mask)),*]; let r = $fn_id::<$n>(transmute(a), transmute(b)); let mut d = e; diff --git a/library/stdarch/crates/core_arch/src/riscv64/zk.rs b/library/stdarch/crates/core_arch/src/riscv64/zk.rs index 3dbe3705d..9b403fc95 100644 --- a/library/stdarch/crates/core_arch/src/riscv64/zk.rs +++ b/library/stdarch/crates/core_arch/src/riscv64/zk.rs @@ -20,6 +20,9 @@ extern "unadjusted" { #[link_name = "llvm.riscv.aes64ks2"] fn _aes64ks2(rs1: i64, rs2: i64) -> i64; + #[link_name = "llvm.riscv.aes64im"] + fn _aes64im(rs1: i64) -> i64; + #[link_name = "llvm.riscv.sha512sig0"] fn _sha512sig0(rs1: i64) -> i64; @@ -50,8 +53,7 @@ extern "unadjusted" { /// /// This function is safe to use if the `zkne` target feature is present. #[target_feature(enable = "zkne")] -// See #1464 -// #[cfg_attr(test, assert_instr(aes64es))] +#[cfg_attr(test, assert_instr(aes64es))] #[inline] pub unsafe fn aes64es(rs1: u64, rs2: u64) -> u64 { _aes64es(rs1 as i64, rs2 as i64) as u64 @@ -74,8 +76,7 @@ pub unsafe fn aes64es(rs1: u64, rs2: u64) -> u64 { /// /// This function is safe to use if the `zkne` target feature is present. 
#[target_feature(enable = "zkne")] -// See #1464 -// #[cfg_attr(test, assert_instr(aes64esm))] +#[cfg_attr(test, assert_instr(aes64esm))] #[inline] pub unsafe fn aes64esm(rs1: u64, rs2: u64) -> u64 { _aes64esm(rs1 as i64, rs2 as i64) as u64 @@ -98,8 +99,7 @@ pub unsafe fn aes64esm(rs1: u64, rs2: u64) -> u64 { /// /// This function is safe to use if the `zknd` target feature is present. #[target_feature(enable = "zknd")] -// See #1464 -// #[cfg_attr(test, assert_instr(aes64ds))] +#[cfg_attr(test, assert_instr(aes64ds))] #[inline] pub unsafe fn aes64ds(rs1: u64, rs2: u64) -> u64 { _aes64ds(rs1 as i64, rs2 as i64) as u64 @@ -122,8 +122,7 @@ pub unsafe fn aes64ds(rs1: u64, rs2: u64) -> u64 { /// /// This function is safe to use if the `zknd` target feature is present. #[target_feature(enable = "zknd")] -// See #1464 -// #[cfg_attr(test, assert_instr(aes64dsm))] +#[cfg_attr(test, assert_instr(aes64dsm))] #[inline] pub unsafe fn aes64dsm(rs1: u64, rs2: u64) -> u64 { _aes64dsm(rs1 as i64, rs2 as i64) as u64 @@ -152,8 +151,7 @@ pub unsafe fn aes64dsm(rs1: u64, rs2: u64) -> u64 { /// This function is safe to use if the `zkne` or `zknd` target feature is present. #[target_feature(enable = "zkne", enable = "zknd")] #[rustc_legacy_const_generics(1)] -// See #1464 -// #[cfg_attr(test, assert_instr(aes64ks1i, RNUM = 0))] +#[cfg_attr(test, assert_instr(aes64ks1i, RNUM = 0))] #[inline] pub unsafe fn aes64ks1i(rs1: u64) -> u64 { static_assert!(RNUM <= 10); @@ -177,13 +175,36 @@ pub unsafe fn aes64ks1i(rs1: u64) -> u64 { /// /// This function is safe to use if the `zkne` or `zknd` target feature is present. #[target_feature(enable = "zkne", enable = "zknd")] -// See #1464 -// #[cfg_attr(test, assert_instr(aes64ks2))] +#[cfg_attr(test, assert_instr(aes64ks2))] #[inline] pub unsafe fn aes64ks2(rs1: u64, rs2: u64) -> u64 { _aes64ks2(rs1 as i64, rs2 as i64) as u64 } +/// This instruction accelerates the inverse MixColumns step of the AES Block Cipher, and is used to aid creation of +/// the decryption KeySchedule. +/// +/// The instruction applies the inverse MixColumns transformation to two columns of the state array, packed +/// into a single 64-bit register. It is used to create the inverse cipher KeySchedule, according to the equivalent +/// inverse cipher construction in (Page 23, Section 5.3.5). This instruction must always be implemented +/// such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.9 +/// +/// # Safety +/// +/// This function is safe to use if the `zkne` or `zknd` target feature is present. +#[target_feature(enable = "zkne", enable = "zknd")] +#[cfg_attr(test, assert_instr(aes64im))] +#[inline] +pub unsafe fn aes64im(rs1: u64) -> u64 { + _aes64im(rs1 as i64) as u64 +} + /// Implements the Sigma0 transformation function as used in the SHA2-512 hash function \[49\] /// (Section 4.1.3). /// @@ -201,8 +222,7 @@ pub unsafe fn aes64ks2(rs1: u64, rs2: u64) -> u64 { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha512sig0))] +#[cfg_attr(test, assert_instr(sha512sig0))] #[inline] pub unsafe fn sha512sig0(rs1: u64) -> u64 { _sha512sig0(rs1 as i64) as u64 @@ -225,8 +245,7 @@ pub unsafe fn sha512sig0(rs1: u64) -> u64 { /// /// This function is safe to use if the `zknh` target feature is present. 
#[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha512sig1))] +#[cfg_attr(test, assert_instr(sha512sig1))] #[inline] pub unsafe fn sha512sig1(rs1: u64) -> u64 { _sha512sig1(rs1 as i64) as u64 @@ -249,8 +268,7 @@ pub unsafe fn sha512sig1(rs1: u64) -> u64 { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha512sum0))] +#[cfg_attr(test, assert_instr(sha512sum0))] #[inline] pub unsafe fn sha512sum0(rs1: u64) -> u64 { _sha512sum0(rs1 as i64) as u64 @@ -273,8 +291,7 @@ pub unsafe fn sha512sum0(rs1: u64) -> u64 { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha512sum1))] +#[cfg_attr(test, assert_instr(sha512sum1))] #[inline] pub unsafe fn sha512sum1(rs1: u64) -> u64 { _sha512sum1(rs1 as i64) as u64 diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs b/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs index cfae6caa5..6785c04fd 100644 --- a/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs +++ b/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs @@ -47,8 +47,7 @@ extern "unadjusted" { /// /// This function is safe to use if the `zbb` target feature is present. #[target_feature(enable = "zbb")] -// See #1464 -// #[cfg_attr(test, assert_instr(orc.b))] +#[cfg_attr(test, assert_instr(orc.b))] #[inline] pub unsafe fn orc_b(rs: usize) -> usize { #[cfg(target_arch = "riscv32")] @@ -76,8 +75,7 @@ pub unsafe fn orc_b(rs: usize) -> usize { /// /// This function is safe to use if the `zbc` target feature is present. #[target_feature(enable = "zbc")] -// See #1464 -// #[cfg_attr(test, assert_instr(clmul))] +#[cfg_attr(test, assert_instr(clmul))] #[inline] pub unsafe fn clmul(rs1: usize, rs2: usize) -> usize { #[cfg(target_arch = "riscv32")] @@ -105,8 +103,7 @@ pub unsafe fn clmul(rs1: usize, rs2: usize) -> usize { /// /// This function is safe to use if the `zbc` target feature is present. #[target_feature(enable = "zbc")] -// See #1464 -// #[cfg_attr(test, assert_instr(clmulh))] +#[cfg_attr(test, assert_instr(clmulh))] #[inline] pub unsafe fn clmulh(rs1: usize, rs2: usize) -> usize { #[cfg(target_arch = "riscv32")] @@ -134,8 +131,7 @@ pub unsafe fn clmulh(rs1: usize, rs2: usize) -> usize { /// /// This function is safe to use if the `zbc` target feature is present. #[target_feature(enable = "zbc")] -// See #1464 -// #[cfg_attr(test, assert_instr(clmulr))] +#[cfg_attr(test, assert_instr(clmulr))] #[inline] pub unsafe fn clmulr(rs1: usize, rs2: usize) -> usize { #[cfg(target_arch = "riscv32")] diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs b/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs index db97f72bc..5fc5b4cda 100644 --- a/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs +++ b/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs @@ -62,8 +62,7 @@ extern "unadjusted" { /// /// This function is safe to use if the `zbkx` target feature is present. #[target_feature(enable = "zbkx")] -// See #1464 -// #[cfg_attr(test, assert_instr(xperm8))] +#[cfg_attr(test, assert_instr(xperm8))] #[inline] pub unsafe fn xperm8(rs1: usize, rs2: usize) -> usize { #[cfg(target_arch = "riscv32")] @@ -94,8 +93,7 @@ pub unsafe fn xperm8(rs1: usize, rs2: usize) -> usize { /// /// This function is safe to use if the `zbkx` target feature is present. 
#[target_feature(enable = "zbkx")] -// See #1464 -// #[cfg_attr(test, assert_instr(xperm4))] +#[cfg_attr(test, assert_instr(xperm4))] #[inline] pub unsafe fn xperm4(rs1: usize, rs2: usize) -> usize { #[cfg(target_arch = "riscv32")] @@ -129,8 +127,7 @@ pub unsafe fn xperm4(rs1: usize, rs2: usize) -> usize { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha256sig0))] +#[cfg_attr(test, assert_instr(sha256sig0))] #[inline] pub unsafe fn sha256sig0(rs1: u32) -> u32 { _sha256sig0(rs1 as i32) as u32 @@ -156,8 +153,7 @@ pub unsafe fn sha256sig0(rs1: u32) -> u32 { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha256sig1))] +#[cfg_attr(test, assert_instr(sha256sig1))] #[inline] pub unsafe fn sha256sig1(rs1: u32) -> u32 { _sha256sig1(rs1 as i32) as u32 @@ -183,8 +179,7 @@ pub unsafe fn sha256sig1(rs1: u32) -> u32 { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha256sum0))] +#[cfg_attr(test, assert_instr(sha256sum0))] #[inline] pub unsafe fn sha256sum0(rs1: u32) -> u32 { _sha256sum0(rs1 as i32) as u32 @@ -210,8 +205,7 @@ pub unsafe fn sha256sum0(rs1: u32) -> u32 { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha256sum1))] +#[cfg_attr(test, assert_instr(sha256sum1))] #[inline] pub unsafe fn sha256sum1(rs1: u32) -> u32 { _sha256sum1(rs1 as i32) as u32 @@ -288,8 +282,7 @@ pub unsafe fn sha256sum1(rs1: u32) -> u32 { /// ``` #[target_feature(enable = "zksed")] #[rustc_legacy_const_generics(2)] -// See #1464 -// #[cfg_attr(test, assert_instr(sm4ed, BS = 0))] +#[cfg_attr(test, assert_instr(sm4ed, BS = 0))] #[inline] pub unsafe fn sm4ed(rs1: u32, rs2: u32) -> u32 { static_assert!(BS < 4); @@ -368,8 +361,7 @@ pub unsafe fn sm4ed(rs1: u32, rs2: u32) -> u32 { /// ``` #[target_feature(enable = "zksed")] #[rustc_legacy_const_generics(2)] -// See #1464 -// #[cfg_attr(test, assert_instr(sm4ks, BS = 0))] +#[cfg_attr(test, assert_instr(sm4ks, BS = 0))] #[inline] pub unsafe fn sm4ks(rs1: u32, rs2: u32) -> u32 { static_assert!(BS < 4); @@ -409,8 +401,7 @@ pub unsafe fn sm4ks(rs1: u32, rs2: u32) -> u32 { /// compression function `CF` uses the intermediate value `TT2` to calculate /// the variable `E` in one iteration for subsequent processes. 
#[target_feature(enable = "zksh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sm3p0))] +#[cfg_attr(test, assert_instr(sm3p0))] #[inline] pub unsafe fn sm3p0(rs1: u32) -> u32 { _sm3p0(rs1 as i32) as u32 @@ -454,8 +445,7 @@ pub unsafe fn sm3p0(rs1: u32) -> u32 { /// ENDFOR /// ``` #[target_feature(enable = "zksh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sm3p1))] +#[cfg_attr(test, assert_instr(sm3p1))] #[inline] pub unsafe fn sm3p1(rs1: u32) -> u32 { _sm3p1(rs1 as i32) as u32 diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs index 00bcc1fa1..de5dc05b8 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx.rs @@ -268,7 +268,11 @@ pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { #[cfg_attr(test, assert_instr(vaddsubpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { - addsubpd256(a, b) + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [4, 1, 6, 3]) } /// Alternatively adds and subtracts packed single-precision (32-bit) @@ -280,7 +284,11 @@ pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { #[cfg_attr(test, assert_instr(vaddsubps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { - addsubps256(a, b) + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) } /// Subtracts packed double-precision (64-bit) floating-point elements in `b` @@ -511,7 +519,8 @@ pub unsafe fn _mm256_blend_ps(a: __m256, b: __m256) -> __m256 { #[cfg_attr(test, assert_instr(vblendvpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { - vblendvpd(a, b, c) + let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::splat(0)); + transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4())) } /// Blends packed single-precision (32-bit) floating-point elements from @@ -523,7 +532,8 @@ pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { #[cfg_attr(test, assert_instr(vblendvps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { - vblendvps(a, b, c) + let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::splat(0)); + transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8())) } /// Conditionally multiplies the packed single-precision (32-bit) floating-point @@ -2056,7 +2066,10 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { #[cfg_attr(test, assert_instr(vmovmskpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { - movmskpd256(a) + // Propagate the highest bit to the rest, because simd_bitmask + // requires all-1 or all-0. 
+ let mask: i64x4 = simd_lt(transmute(a), i64x4::splat(0)); + simd_bitmask::(mask).into() } /// Sets each bit of the returned mask based on the most significant bit of the @@ -2069,7 +2082,10 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { #[cfg_attr(test, assert_instr(vmovmskps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 { - movmskps256(a) + // Propagate the highest bit to the rest, because simd_bitmask + // requires all-1 or all-0. + let mask: i32x8 = simd_lt(transmute(a), i32x8::splat(0)); + simd_bitmask::(mask).into() } /// Returns vector of type __m256d with all elements set to zero. @@ -2904,20 +2920,12 @@ pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 { // LLVM intrinsics used in the above functions #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.avx.addsub.pd.256"] - fn addsubpd256(a: __m256d, b: __m256d) -> __m256d; - #[link_name = "llvm.x86.avx.addsub.ps.256"] - fn addsubps256(a: __m256, b: __m256) -> __m256; #[link_name = "llvm.x86.avx.round.pd.256"] fn roundpd256(a: __m256d, b: i32) -> __m256d; #[link_name = "llvm.x86.avx.round.ps.256"] fn roundps256(a: __m256, b: i32) -> __m256; #[link_name = "llvm.x86.avx.sqrt.ps.256"] fn sqrtps256(a: __m256) -> __m256; - #[link_name = "llvm.x86.avx.blendv.pd.256"] - fn vblendvpd(a: __m256d, b: __m256d, c: __m256d) -> __m256d; - #[link_name = "llvm.x86.avx.blendv.ps.256"] - fn vblendvps(a: __m256, b: __m256, c: __m256) -> __m256; #[link_name = "llvm.x86.avx.dp.ps.256"] fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256; #[link_name = "llvm.x86.avx.hadd.pd.256"] @@ -3026,10 +3034,6 @@ extern "C" { fn vtestcps(a: __m128, b: __m128) -> i32; #[link_name = "llvm.x86.avx.vtestnzc.ps"] fn vtestnzcps(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.avx.movmsk.pd.256"] - fn movmskpd256(a: __m256d) -> i32; - #[link_name = "llvm.x86.avx.movmsk.ps.256"] - fn movmskps256(a: __m256) -> i32; #[link_name = "llvm.x86.avx.min.ps.256"] fn vminps(a: __m256, b: __m256) -> __m256; #[link_name = "llvm.x86.avx.max.ps.256"] diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs index e23c795ee..243a4cdab 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -344,7 +344,10 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpavgw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { - transmute(pavgw(a.as_u16x16(), b.as_u16x16())) + let a = simd_cast::<_, u32x16>(a.as_u16x16()); + let b = simd_cast::<_, u32x16>(b.as_u16x16()); + let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); + transmute(simd_cast::<_, u16x16>(r)) } /// Averages packed unsigned 8-bit integers in `a` and `b`. @@ -355,7 +358,10 @@ pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpavgb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { - transmute(pavgb(a.as_u8x32(), b.as_u8x32())) + let a = simd_cast::<_, u16x32>(a.as_u8x32()); + let b = simd_cast::<_, u16x32>(b.as_u8x32()); + let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); + transmute(simd_cast::<_, u8x32>(r)) } /// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. 
@@ -458,7 +464,8 @@ pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m #[cfg_attr(test, assert_instr(vpblendvb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { - transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32())) + let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::splat(0)); + transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32())) } /// Broadcasts the low packed 8-bit integer from `a` to all elements of @@ -2060,7 +2067,9 @@ pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __ #[cfg_attr(test, assert_instr(vpmuldq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(pmuldq(a.as_i32x8(), b.as_i32x8())) + let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4())); + let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4())); + transmute(simd_mul(a, b)) } /// Multiplies the low unsigned 32-bit integers from each packed 64-bit @@ -2074,7 +2083,10 @@ pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmuludq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { - transmute(pmuludq(a.as_u32x8(), b.as_u32x8())) + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let mask = u64x4::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) } /// Multiplies the packed 16-bit integers in `a` and `b`, producing @@ -2087,7 +2099,10 @@ pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmulhw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmulhw(a.as_i16x16(), b.as_i16x16())) + let a = simd_cast::<_, i32x16>(a.as_i16x16()); + let b = simd_cast::<_, i32x16>(b.as_i16x16()); + let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); + transmute(simd_cast::(r)) } /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing @@ -2100,7 +2115,10 @@ pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmulhuw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmulhuw(a.as_u16x16(), b.as_u16x16())) + let a = simd_cast::<_, u32x16>(a.as_u16x16()); + let b = simd_cast::<_, u32x16>(b.as_u16x16()); + let r = simd_shr(simd_mul(a, b), u32x16::splat(16)); + transmute(simd_cast::(r)) } /// Multiplies the packed 16-bit integers in `a` and `b`, producing @@ -3629,12 +3647,6 @@ extern "C" { fn pabsw(a: i16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pabs.d"] fn pabsd(a: i32x8) -> u32x8; - #[link_name = "llvm.x86.avx2.pavg.b"] - fn pavgb(a: u8x32, b: u8x32) -> u8x32; - #[link_name = "llvm.x86.avx2.pavg.w"] - fn pavgw(a: u16x16, b: u16x16) -> u16x16; - #[link_name = "llvm.x86.avx2.pblendvb"] - fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.phadd.w"] fn phaddw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.phadd.d"] @@ -3669,14 +3681,6 @@ extern "C" { fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4); #[link_name = "llvm.x86.avx2.mpsadbw"] fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16; - #[link_name = "llvm.x86.avx2.pmulhu.w"] - fn pmulhuw(a: u16x16, b: u16x16) -> u16x16; - #[link_name = "llvm.x86.avx2.pmulh.w"] - fn pmulhw(a: 
i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx2.pmul.dq"] - fn pmuldq(a: i32x8, b: i32x8) -> i64x4; - #[link_name = "llvm.x86.avx2.pmulu.dq"] - fn pmuludq(a: u32x8, b: u32x8) -> u64x4; #[link_name = "llvm.x86.avx2.pmul.hr.sw"] fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.packsswb"] diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs index 92e572eb1..ce4e402a8 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs @@ -311,7 +311,7 @@ pub unsafe fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __ #[target_feature(enable = "avx512bitalg")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 { - transmute(bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0)) + bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0) } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. @@ -326,7 +326,7 @@ pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 #[target_feature(enable = "avx512bitalg")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m512i) -> __mmask64 { - transmute(bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k)) + bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k) } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. @@ -338,7 +338,7 @@ pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m #[target_feature(enable = "avx512bitalg,avx512vl")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 { - transmute(bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0)) + bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0) } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. @@ -353,7 +353,7 @@ pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 #[target_feature(enable = "avx512bitalg,avx512vl")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m256i) -> __mmask32 { - transmute(bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k)) + bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k) } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. @@ -365,7 +365,7 @@ pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m #[target_feature(enable = "avx512bitalg,avx512vl")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 { - transmute(bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0)) + bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0) } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. 
@@ -380,7 +380,7 @@ pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bitalg,avx512vl")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] pub unsafe fn _mm_mask_bitshuffle_epi64_mask(k: __mmask16, b: __m128i, c: __m128i) -> __mmask16 { - transmute(bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k)) + bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k) } #[cfg(test)] diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs index 364023539..0b4a56d36 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs @@ -3703,8 +3703,7 @@ pub unsafe fn _mm512_cmp_epu16_mask(a: __m512i, b: __m512i) -> static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x32(); let b = b.as_u16x32(); - let r = vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111); - transmute(r) + vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3722,8 +3721,7 @@ pub unsafe fn _mm512_mask_cmp_epu16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x32(); let b = b.as_u16x32(); - let r = vpcmpuw(a, b, IMM8, k1); - transmute(r) + vpcmpuw(a, b, IMM8, k1) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3737,8 +3735,7 @@ pub unsafe fn _mm256_cmp_epu16_mask(a: __m256i, b: __m256i) -> static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x16(); let b = b.as_u16x16(); - let r = vpcmpuw256(a, b, IMM8, 0b11111111_11111111); - transmute(r) + vpcmpuw256(a, b, IMM8, 0b11111111_11111111) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3756,8 +3753,7 @@ pub unsafe fn _mm256_mask_cmp_epu16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x16(); let b = b.as_u16x16(); - let r = vpcmpuw256(a, b, IMM8, k1); - transmute(r) + vpcmpuw256(a, b, IMM8, k1) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3771,8 +3767,7 @@ pub unsafe fn _mm_cmp_epu16_mask(a: __m128i, b: __m128i) -> __m static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x8(); let b = b.as_u16x8(); - let r = vpcmpuw128(a, b, IMM8, 0b11111111); - transmute(r) + vpcmpuw128(a, b, IMM8, 0b11111111) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3790,8 +3785,7 @@ pub unsafe fn _mm_mask_cmp_epu16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x8(); let b = b.as_u16x8(); - let r = vpcmpuw128(a, b, IMM8, k1); - transmute(r) + vpcmpuw128(a, b, IMM8, k1) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -3805,13 +3799,12 @@ pub unsafe fn _mm512_cmp_epu8_mask(a: __m512i, b: __m512i) -> _ static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x64(); let b = b.as_u8x64(); - let r = vpcmpub( + vpcmpub( a, b, IMM8, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - ); - transmute(r) + ) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3829,8 +3822,7 @@ pub unsafe fn _mm512_mask_cmp_epu8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x64(); let b = b.as_u8x64(); - let r = vpcmpub(a, b, IMM8, k1); - transmute(r) + vpcmpub(a, b, IMM8, k1) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3844,8 +3836,7 @@ pub unsafe fn _mm256_cmp_epu8_mask(a: __m256i, b: __m256i) -> _ static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x32(); let b = b.as_u8x32(); - let r = vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111); - transmute(r) + vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3863,8 +3854,7 @@ pub unsafe fn _mm256_mask_cmp_epu8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x32(); let b = b.as_u8x32(); - let r = vpcmpub256(a, b, IMM8, k1); - transmute(r) + vpcmpub256(a, b, IMM8, k1) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3878,8 +3868,7 @@ pub unsafe fn _mm_cmp_epu8_mask(a: __m128i, b: __m128i) -> __mm static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x16(); let b = b.as_u8x16(); - let r = vpcmpub128(a, b, IMM8, 0b11111111_11111111); - transmute(r) + vpcmpub128(a, b, IMM8, 0b11111111_11111111) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3897,8 +3886,7 @@ pub unsafe fn _mm_mask_cmp_epu8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x16(); let b = b.as_u8x16(); - let r = vpcmpub128(a, b, IMM8, k1); - transmute(r) + vpcmpub128(a, b, IMM8, k1) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3912,8 +3900,7 @@ pub unsafe fn _mm512_cmp_epi16_mask(a: __m512i, b: __m512i) -> static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x32(); let b = b.as_i16x32(); - let r = vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111); - transmute(r) + vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3931,8 +3918,7 @@ pub unsafe fn _mm512_mask_cmp_epi16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x32(); let b = b.as_i16x32(); - let r = vpcmpw(a, b, IMM8, k1); - transmute(r) + vpcmpw(a, b, IMM8, k1) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3946,8 +3932,7 @@ pub unsafe fn _mm256_cmp_epi16_mask(a: __m256i, b: __m256i) -> static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x16(); let b = b.as_i16x16(); - let r = vpcmpw256(a, b, IMM8, 0b11111111_11111111); - transmute(r) + vpcmpw256(a, b, IMM8, 0b11111111_11111111) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3965,8 +3950,7 @@ pub unsafe fn _mm256_mask_cmp_epi16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x16(); let b = b.as_i16x16(); - let r = vpcmpw256(a, b, IMM8, k1); - transmute(r) + vpcmpw256(a, b, IMM8, k1) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3980,8 +3964,7 @@ pub unsafe fn _mm_cmp_epi16_mask(a: __m128i, b: __m128i) -> __m static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x8(); let b = b.as_i16x8(); - let r = vpcmpw128(a, b, IMM8, 0b11111111); - transmute(r) + vpcmpw128(a, b, IMM8, 0b11111111) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3999,8 +3982,7 @@ pub unsafe fn _mm_mask_cmp_epi16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x8(); let b = b.as_i16x8(); - let r = vpcmpw128(a, b, IMM8, k1); - transmute(r) + vpcmpw128(a, b, IMM8, k1) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4014,13 +3996,12 @@ pub unsafe fn _mm512_cmp_epi8_mask(a: __m512i, b: __m512i) -> _ static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x64(); let b = b.as_i8x64(); - let r = vpcmpb( + vpcmpb( a, b, IMM8, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - ); - transmute(r) + ) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4038,8 +4019,7 @@ pub unsafe fn _mm512_mask_cmp_epi8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x64(); let b = b.as_i8x64(); - let r = vpcmpb(a, b, IMM8, k1); - transmute(r) + vpcmpb(a, b, IMM8, k1) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4053,8 +4033,7 @@ pub unsafe fn _mm256_cmp_epi8_mask(a: __m256i, b: __m256i) -> _ static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x32(); let b = b.as_i8x32(); - let r = vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111); - transmute(r) + vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4072,8 +4051,7 @@ pub unsafe fn _mm256_mask_cmp_epi8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x32(); let b = b.as_i8x32(); - let r = vpcmpb256(a, b, IMM8, k1); - transmute(r) + vpcmpb256(a, b, IMM8, k1) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4087,8 +4065,7 @@ pub unsafe fn _mm_cmp_epi8_mask(a: __m128i, b: __m128i) -> __mm static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x16(); let b = b.as_i8x16(); - let r = vpcmpb128(a, b, IMM8, 0b11111111_11111111); - transmute(r) + vpcmpb128(a, b, IMM8, 0b11111111_11111111) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4106,8 +4083,7 @@ pub unsafe fn _mm_mask_cmp_epi8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x16(); let b = b.as_i8x16(); - let r = vpcmpb128(a, b, IMM8, k1); - transmute(r) + vpcmpb128(a, b, IMM8, k1) } /// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. @@ -8566,7 +8542,7 @@ pub unsafe fn _mm_movm_epi8(k: __mmask16) -> __m128i { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - transmute(a + b) + a + b } /// Add 64-bit masks in a and b, and store the result in k. @@ -8575,7 +8551,7 @@ pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - transmute(a + b) + a + b } /// Compute the bitwise AND of 32-bit masks a and b, and store the result in k. @@ -8584,7 +8560,7 @@ pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - transmute(a & b) + a & b } /// Compute the bitwise AND of 64-bit masks a and b, and store the result in k. @@ -8593,7 +8569,7 @@ pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - transmute(a & b) + a & b } /// Compute the bitwise NOT of 32-bit mask a, and store the result in k. @@ -8602,7 +8578,7 @@ pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 { - transmute(a ^ 0b11111111_11111111_11111111_11111111) + a ^ 0b11111111_11111111_11111111_11111111 } /// Compute the bitwise NOT of 64-bit mask a, and store the result in k. @@ -8611,7 +8587,7 @@ pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 { - transmute(a ^ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111) + a ^ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 } /// Compute the bitwise NOT of 32-bit masks a and then AND with b, and store the result in k. 
@@ -8620,7 +8596,7 @@ pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - transmute(_knot_mask32(a) & b) + _knot_mask32(a) & b } /// Compute the bitwise NOT of 64-bit masks a and then AND with b, and store the result in k. @@ -8629,7 +8605,7 @@ pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - transmute(_knot_mask64(a) & b) + _knot_mask64(a) & b } /// Compute the bitwise OR of 32-bit masks a and b, and store the result in k. @@ -8638,7 +8614,7 @@ pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - transmute(a | b) + a | b } /// Compute the bitwise OR of 64-bit masks a and b, and store the result in k. @@ -8647,7 +8623,7 @@ pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - transmute(a | b) + a | b } /// Compute the bitwise XOR of 32-bit masks a and b, and store the result in k. @@ -8656,7 +8632,7 @@ pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - transmute(a ^ b) + a ^ b } /// Compute the bitwise XOR of 64-bit masks a and b, and store the result in k. @@ -8665,7 +8641,7 @@ pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - transmute(a ^ b) + a ^ b } /// Compute the bitwise XNOR of 32-bit masks a and b, and store the result in k. @@ -8674,7 +8650,7 @@ pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - transmute(_knot_mask32(a ^ b)) + _knot_mask32(a ^ b) } /// Compute the bitwise XNOR of 64-bit masks a and b, and store the result in k. @@ -8683,7 +8659,7 @@ pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - transmute(_knot_mask64(a ^ b)) + _knot_mask64(a ^ b) } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. 
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs index 5412237ca..280135292 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs @@ -17144,7 +17144,7 @@ pub unsafe fn _mm512_slli_epi32(a: __m512i) -> __m512i { if IMM8 >= 32 { _mm512_setzero_si512() } else { - transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8 as u32))) + transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8))) } } @@ -20132,7 +20132,7 @@ pub unsafe fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m5 #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermd pub unsafe fn _mm256_permutexvar_epi32(idx: __m256i, a: __m256i) -> __m256i { - transmute(_mm256_permutevar8x32_epi32(a, idx)) // llvm use llvm.x86.avx2.permd + _mm256_permutevar8x32_epi32(a, idx) // llvm use llvm.x86.avx2.permd } /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20284,7 +20284,7 @@ pub unsafe fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512) #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpermps))] pub unsafe fn _mm256_permutexvar_ps(idx: __m256i, a: __m256) -> __m256 { - transmute(_mm256_permutevar8x32_ps(a, idx)) //llvm.x86.avx2.permps + _mm256_permutevar8x32_ps(a, idx) //llvm.x86.avx2.permps } /// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -23943,7 +23943,7 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d { #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(vmovd))] pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { let extract: i32 = simd_extract(a.as_i32x16(), 0); - transmute(extract) + extract } /// Broadcast the low packed 32-bit integer from a to all elements of dst. @@ -25744,7 +25744,7 @@ pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(a & b) + a & b } /// Compute the bitwise AND of 16-bit masks a and b, and store the result in k. @@ -25754,7 +25754,7 @@ pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(a & b) + a & b } /// Compute the bitwise OR of 16-bit masks a and b, and store the result in k. @@ -25764,7 +25764,7 @@ pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(a | b) + a | b } /// Compute the bitwise OR of 16-bit masks a and b, and store the result in k. 
@@ -25774,7 +25774,7 @@ pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(a | b) + a | b } /// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k. @@ -25784,7 +25784,7 @@ pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(a ^ b) + a ^ b } /// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k. @@ -25794,7 +25794,7 @@ pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(a ^ b) + a ^ b } /// Compute the bitwise NOT of 16-bit mask a, and store the result in k. @@ -25803,7 +25803,7 @@ pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 { #[inline] #[target_feature(enable = "avx512f")] pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 { - transmute(a ^ 0b11111111_11111111) + a ^ 0b11111111_11111111 } /// Compute the bitwise NOT of 16-bit mask a, and store the result in k. @@ -25812,7 +25812,7 @@ pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 { #[inline] #[target_feature(enable = "avx512f")] pub unsafe fn _mm512_knot(a: __mmask16) -> __mmask16 { - transmute(a ^ 0b11111111_11111111) + a ^ 0b11111111_11111111 } /// Compute the bitwise NOT of 16-bit masks a and then AND with b, and store the result in k. @@ -25862,8 +25862,7 @@ pub unsafe fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 { - let r: u16 = a; - transmute(r) + a } /// Converts integer mask into bitmask, storing the result in dst. @@ -25872,8 +25871,7 @@ pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 { #[inline] #[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 { - let r: u16 = mask as u16; - transmute(r) + mask as u16 } /// Converts bit mask k1 into an integer value, storing the results in dst. @@ -25883,8 +25881,7 @@ pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 { - let r: i32 = k1 as i32; - transmute(r) + k1 as i32 } /// Unpack and interleave 8 bits from masks a and b, and store the 16-bit result in k. @@ -25896,7 +25893,7 @@ pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 { pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 { let a = a & 0b00000000_11111111; let b = b & 0b11111111_00000000; - transmute(a | b) + a | b } /// Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's. 
@@ -32352,8 +32349,7 @@ pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) - if (k & 0b00000001) != 0 { mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, mov); - transmute(r) + simd_insert(a, 0, mov) } /// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32367,8 +32363,7 @@ pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { if (k & 0b00000001) != 0 { mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, mov); - transmute(r) + simd_insert(a, 0, mov) } /// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32383,8 +32378,7 @@ pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d if (k & 0b00000001) != 0 { mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, mov); - transmute(r) + simd_insert(a, 0, mov) } /// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32398,8 +32392,7 @@ pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d if (k & 0b00000001) != 0 { mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, mov); - transmute(r) + simd_insert(a, 0, mov) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32416,8 +32409,7 @@ pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> let extractb: f32 = simd_extract(b, 0); add = extracta + extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32433,8 +32425,7 @@ pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { let extractb: f32 = simd_extract(b, 0); add = extracta + extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32451,8 +32442,7 @@ pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) let extractb: f64 = simd_extract(b, 0); add = extracta + extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -32468,8 +32458,7 @@ pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { let extractb: f64 = simd_extract(b, 0); add = extracta + extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32486,8 +32475,7 @@ pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> let extractb: f32 = simd_extract(b, 0); add = extracta - extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32503,8 +32491,7 @@ pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { let extractb: f32 = simd_extract(b, 0); add = extracta - extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32521,8 +32508,7 @@ pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) let extractb: f64 = simd_extract(b, 0); add = extracta - extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32538,8 +32524,7 @@ pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { let extractb: f64 = simd_extract(b, 0); add = extracta - extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32556,8 +32541,7 @@ pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> let extractb: f32 = simd_extract(b, 0); add = extracta * extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
@@ -32573,8 +32557,7 @@ pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { let extractb: f32 = simd_extract(b, 0); add = extracta * extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32591,8 +32574,7 @@ pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) let extractb: f64 = simd_extract(b, 0); add = extracta * extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32608,8 +32590,7 @@ pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { let extractb: f64 = simd_extract(b, 0); add = extracta * extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32626,8 +32607,7 @@ pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> let extractb: f32 = simd_extract(b, 0); add = extracta / extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32643,8 +32623,7 @@ pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { let extractb: f32 = simd_extract(b, 0); add = extracta / extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32661,8 +32640,7 @@ pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) let extractb: f64 = simd_extract(b, 0); add = extracta / extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -32678,8 +32656,7 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { let extractb: f64 = simd_extract(b, 0); add = extracta / extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33587,8 +33564,7 @@ pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> let extractc: f32 = simd_extract(c, 0); fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33605,8 +33581,7 @@ pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - let extractc: f32 = simd_extract(c, 0); fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. @@ -33622,8 +33597,7 @@ pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) - let extractb: f32 = simd_extract(b, 0); fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fmadd); - transmute(r) + simd_insert(c, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -33639,8 +33613,7 @@ pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) let extractc: f64 = simd_extract(c, 0); fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -33657,8 +33630,7 @@ pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d let extractc: f64 = simd_extract(c, 0); fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. @@ -33674,8 +33646,7 @@ pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 let extractb: f64 = simd_extract(b, 0); fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fmadd); - transmute(r) + simd_insert(c, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33692,8 +33663,7 @@ pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> let extractc = -extractc; fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33711,8 +33681,7 @@ pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - let extractc = -extractc; fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. @@ -33729,8 +33698,7 @@ pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) - let extractc = -fmsub; fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fmsub); - transmute(r) + simd_insert(c, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -33747,8 +33715,7 @@ pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) let extractc = -extractc; fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -33766,8 +33733,7 @@ pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d let extractc = -extractc; fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. @@ -33784,8 +33750,7 @@ pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 let extractc = -fmsub; fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fmsub); - transmute(r) + simd_insert(c, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33802,8 +33767,7 @@ pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - let extractc: f32 = simd_extract(c, 0); fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33821,8 +33785,7 @@ pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) let extractc: f32 = simd_extract(c, 0); fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. @@ -33839,8 +33802,7 @@ pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) let extractb: f32 = simd_extract(b, 0); fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fnmadd); - transmute(r) + simd_insert(c, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -33857,8 +33819,7 @@ pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d let extractc: f64 = simd_extract(c, 0); fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -33876,8 +33837,7 @@ pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 let extractc: f64 = simd_extract(c, 0); fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. @@ -33894,8 +33854,7 @@ pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask let extractb: f64 = simd_extract(b, 0); fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fnmadd); - transmute(r) + simd_insert(c, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33913,8 +33872,7 @@ pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - let extractc = -extractc; fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33933,8 +33891,7 @@ pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) let extractc = -extractc; fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
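All of the fused variants above are routed through a single fused multiply-add (`vfmadd132ss`/`vfmadd132sd`), with operands negated beforehand to obtain the fmsub, fnmadd, and fnmsub flavours. A scalar model of that identity, using `f32::mul_add` as a stand-in for the fused operation:

    fn fma(a: f32, b: f32, c: f32) -> f32 { a.mul_add(b, c) }  //  a*b + c

    fn fmsub(a: f32, b: f32, c: f32) -> f32 { fma(a, b, -c) }   //  a*b - c
    fn fnmadd(a: f32, b: f32, c: f32) -> f32 { fma(-a, b, c) }  // -(a*b) + c
    fn fnmsub(a: f32, b: f32, c: f32) -> f32 { fma(-a, b, -c) } // -(a*b) - c

    fn main() {
        assert_eq!(fmsub(2.0, 3.0, 1.0), 5.0);
        assert_eq!(fnmadd(2.0, 3.0, 1.0), -5.0);
        assert_eq!(fnmsub(2.0, 3.0, 1.0), -7.0);
    }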
@@ -33952,8 +33909,7 @@ pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) let extractc = -fnmsub; fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fnmsub); - transmute(r) + simd_insert(c, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -33971,8 +33927,7 @@ pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d let extractc = -extractc; fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -33991,8 +33946,7 @@ pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 let extractc = -extractc; fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. @@ -34010,8 +33964,7 @@ pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask let extractc = -fnmsub; fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fnmsub); - transmute(r) + simd_insert(c, 0, fnmsub) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -35705,8 +35658,7 @@ pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: _ let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let r = vfmadd132ss(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, r); - transmute(r) + simd_insert(a, 0, r) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -35736,8 +35688,7 @@ pub unsafe fn _mm_mask_fmadd_round_ss( let extractc: f32 = simd_extract(c, 0); fmadd = vfmadd132ss(fmadd, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -35768,8 +35719,7 @@ pub unsafe fn _mm_maskz_fmadd_round_ss( let extractc: f32 = simd_extract(c, 0); fmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -35799,8 +35749,7 @@ pub unsafe fn _mm_mask3_fmadd_round_ss( let extractb: f32 = simd_extract(b, 0); fmadd = vfmadd132ss(extracta, extractb, fmadd, ROUNDING); } - let r = simd_insert(c, 0, fmadd); - transmute(r) + simd_insert(c, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -35827,8 +35776,7 @@ pub unsafe fn _mm_fmadd_round_sd( let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -35858,8 +35806,7 @@ pub unsafe fn _mm_mask_fmadd_round_sd( let extractc: f64 = simd_extract(c, 0); fmadd = vfmadd132sd(fmadd, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -35890,8 +35837,7 @@ pub unsafe fn _mm_maskz_fmadd_round_sd( let extractc: f64 = simd_extract(c, 0); fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -35921,8 +35867,7 @@ pub unsafe fn _mm_mask3_fmadd_round_sd( let extractb: f64 = simd_extract(b, 0); fmadd = vfmadd132sd(extracta, extractb, fmadd, ROUNDING); } - let r = simd_insert(c, 0, fmadd); - transmute(r) + simd_insert(c, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -35946,8 +35891,7 @@ pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: _ let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; let fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -35978,8 +35922,7 @@ pub unsafe fn _mm_mask_fmsub_round_ss( let extractc = -extractc; fmsub = vfmadd132ss(fmsub, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36011,8 +35954,7 @@ pub unsafe fn _mm_maskz_fmsub_round_ss( let extractc = -extractc; fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -36043,8 +35985,7 @@ pub unsafe fn _mm_mask3_fmsub_round_ss( let extractc = -fmsub; fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(c, 0, fmsub); - transmute(r) + simd_insert(c, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -36072,8 +36013,7 @@ pub unsafe fn _mm_fmsub_round_sd( let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; let fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36104,8 +36044,7 @@ pub unsafe fn _mm_mask_fmsub_round_sd( let extractc = -extractc; fmsub = vfmadd132sd(fmsub, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36137,8 +36076,7 @@ pub unsafe fn _mm_maskz_fmsub_round_sd( let extractc = -extractc; fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -36169,8 +36107,7 @@ pub unsafe fn _mm_mask3_fmsub_round_sd( let extractc = -fmsub; fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(c, 0, fmsub); - transmute(r) + simd_insert(c, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36194,8 +36131,7 @@ pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36226,8 +36162,7 @@ pub unsafe fn _mm_mask_fnmadd_round_ss( let extractc: f32 = simd_extract(c, 0); fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36259,8 +36194,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_ss( let extractc: f32 = simd_extract(c, 0); fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -36291,8 +36225,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_ss( let extractb: f32 = simd_extract(b, 0); fnmadd = vfmadd132ss(extracta, extractb, fnmadd, ROUNDING); } - let r = simd_insert(c, 0, fnmadd); - transmute(r) + simd_insert(c, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -36320,8 +36253,7 @@ pub unsafe fn _mm_fnmadd_round_sd( let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36352,8 +36284,7 @@ pub unsafe fn _mm_mask_fnmadd_round_sd( let extractc: f64 = simd_extract(c, 0); fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36385,8 +36316,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_sd( let extractc: f64 = simd_extract(c, 0); fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -36417,8 +36347,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd( let extractb: f64 = simd_extract(b, 0); fnmadd = vfmadd132sd(extracta, extractb, fnmadd, ROUNDING); } - let r = simd_insert(c, 0, fnmadd); - transmute(r) + simd_insert(c, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36443,8 +36372,7 @@ pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; let fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36476,8 +36404,7 @@ pub unsafe fn _mm_mask_fnmsub_round_ss( let extractc = -extractc; fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36510,8 +36437,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss( let extractc = -extractc; fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -36543,8 +36469,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_ss( let extractc = -fnmsub; fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(c, 0, fnmsub); - transmute(r) + simd_insert(c, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -36573,8 +36498,7 @@ pub unsafe fn _mm_fnmsub_round_sd( let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; let fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36606,8 +36530,7 @@ pub unsafe fn _mm_mask_fnmsub_round_sd( let extractc = -extractc; fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36640,8 +36563,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd( let extractc = -extractc; fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -36673,8 +36595,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_sd( let extractc = -fnmsub; fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(c, 0, fnmsub); - transmute(r) + simd_insert(c, 0, fnmsub) } /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
imm8 is used to set the required flags reporting. @@ -37168,8 +37089,7 @@ pub unsafe fn _mm_maskz_cvt_roundsd_ss( pub unsafe fn _mm_cvt_roundss_si32(a: __m128) -> i32 { static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - let r = vcvtss2si(a, ROUNDING); - transmute(r) + vcvtss2si(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ @@ -37188,8 +37108,7 @@ pub unsafe fn _mm_cvt_roundss_si32(a: __m128) -> i32 { pub unsafe fn _mm_cvt_roundss_i32(a: __m128) -> i32 { static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - let r = vcvtss2si(a, ROUNDING); - transmute(r) + vcvtss2si(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ @@ -37208,8 +37127,7 @@ pub unsafe fn _mm_cvt_roundss_i32(a: __m128) -> i32 { pub unsafe fn _mm_cvt_roundss_u32(a: __m128) -> u32 { static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - let r = vcvtss2usi(a, ROUNDING); - transmute(r) + vcvtss2usi(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst. @@ -37219,7 +37137,7 @@ pub unsafe fn _mm_cvt_roundss_u32(a: __m128) -> u32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2si))] pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 { - transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst. @@ -37229,7 +37147,7 @@ pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2usi))] pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 { - transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ @@ -37248,8 +37166,7 @@ pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 { pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d) -> i32 { static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - let r = vcvtsd2si(a, ROUNDING); - transmute(r) + vcvtsd2si(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ @@ -37268,8 +37185,7 @@ pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d) -> i32 { pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d) -> i32 { static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - let r = vcvtsd2si(a, ROUNDING); - transmute(r) + vcvtsd2si(a, ROUNDING) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ @@ -37288,8 +37204,7 @@ pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d) -> i32 { pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d) -> u32 { static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - let r = vcvtsd2usi(a, ROUNDING); - transmute(r) + vcvtsd2usi(a, ROUNDING) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst. 
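The `*_round_*` conversions, like the `*_round_*` FMA variants above, take the rounding mode as a const generic checked by `static_assert_rounding!`; the only change here is returning the `i32`/`u32` from `vcvtss2si`/`vcvtsd2si` directly instead of transmuting it. A hedged usage sketch, assuming a nightly toolchain (these AVX-512F intrinsics are still unstable, feature gate omitted) and runtime AVX-512F support:

    use std::arch::x86_64::*;

    fn main() {
        if is_x86_feature_detected!("avx512f") {
            unsafe {
                let a = _mm_set_ss(2.5);
                // Round to nearest (ties to even) and suppress exceptions.
                let r = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
                assert_eq!(r, 2); // 2.5 rounds to the even neighbour
            }
        }
    }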
@@ -37299,7 +37214,7 @@ pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d) -> u32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2si))] pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 { - transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst. @@ -37309,7 +37224,7 @@ pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2usi))] pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 { - transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37382,8 +37297,7 @@ pub unsafe fn _mm_cvt_roundu32_ss(a: __m128, b: u32) -> __m #[cfg_attr(test, assert_instr(vcvtsi2ss))] pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 { let b = b as f32; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the signed 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. @@ -37394,8 +37308,7 @@ pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 { #[cfg_attr(test, assert_instr(vcvtsi2sd))] pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { let b = b as f64; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ @@ -37409,8 +37322,7 @@ pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { pub unsafe fn _mm_cvtt_roundss_si32(a: __m128) -> i32 { static_assert_sae!(SAE); let a = a.as_f32x4(); - let r = vcvtss2si(a, SAE); - transmute(r) + vcvtss2si(a, SAE) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ @@ -37424,8 +37336,7 @@ pub unsafe fn _mm_cvtt_roundss_si32(a: __m128) -> i32 { pub unsafe fn _mm_cvtt_roundss_i32(a: __m128) -> i32 { static_assert_sae!(SAE); let a = a.as_f32x4(); - let r = vcvtss2si(a, SAE); - transmute(r) + vcvtss2si(a, SAE) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ @@ -37439,8 +37350,7 @@ pub unsafe fn _mm_cvtt_roundss_i32(a: __m128) -> i32 { pub unsafe fn _mm_cvtt_roundss_u32(a: __m128) -> u32 { static_assert_sae!(SAE); let a = a.as_f32x4(); - let r = vcvtss2usi(a, SAE); - transmute(r) + vcvtss2usi(a, SAE) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. 
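`_mm_cvti32_ss`/`_mm_cvti32_sd` (and their unsigned counterparts further down) are simply "convert the integer, then overwrite lane 0 of `a`". A plain-Rust model of that behaviour on the array representation (illustrative only):

    fn cvti32_ss_model(a: [f32; 4], b: i32) -> [f32; 4] {
        let mut dst = a;     // upper three lanes are copied from a
        dst[0] = b as f32;   // lane 0 becomes the converted integer
        dst
    }

    fn main() {
        assert_eq!(cvti32_ss_model([9.0, 1.0, 2.0, 3.0], 7), [7.0, 1.0, 2.0, 3.0]);
    }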
@@ -37450,7 +37360,7 @@ pub unsafe fn _mm_cvtt_roundss_u32(a: __m128) -> u32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2si))] pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 { - transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. @@ -37460,7 +37370,7 @@ pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2usi))] pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 { - transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ @@ -37474,8 +37384,7 @@ pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 { pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d) -> i32 { static_assert_sae!(SAE); let a = a.as_f64x2(); - let r = vcvtsd2si(a, SAE); - transmute(r) + vcvtsd2si(a, SAE) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ @@ -37489,8 +37398,7 @@ pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d) -> i32 { pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d) -> i32 { static_assert_sae!(SAE); let a = a.as_f64x2(); - let r = vcvtsd2si(a, SAE); - transmute(r) + vcvtsd2si(a, SAE) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ @@ -37504,8 +37412,7 @@ pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d) -> i32 { pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d) -> u32 { static_assert_sae!(SAE); let a = a.as_f64x2(); - let r = vcvtsd2usi(a, SAE); - transmute(r) + vcvtsd2usi(a, SAE) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. @@ -37515,7 +37422,7 @@ pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d) -> u32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2si))] pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 { - transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. @@ -37525,7 +37432,7 @@ pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2usi))] pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 { - transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
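The `cvtt*` conversions above always truncate toward zero, independent of MXCSR, while the plain `cvt*` forms use the current rounding mode (round to nearest, ties to even, by default). A scalar sketch of the difference; `f32::round` stands in for the default mode and only differs from ties-to-even on exact .5 ties, which this example avoids:

    fn cvtt_model(x: f32) -> i32 { x.trunc() as i32 } // always toward zero
    fn cvt_model(x: f32) -> i32 { x.round() as i32 }  // nearest (no ties here)

    fn main() {
        assert_eq!(cvtt_model(2.9), 2);
        assert_eq!(cvt_model(2.9), 3);
        assert_eq!(cvtt_model(-2.9), -2);
        assert_eq!(cvt_model(-2.9), -3);
    }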
@@ -37536,8 +37443,7 @@ pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 { #[cfg_attr(test, assert_instr(vcvtusi2ss))] pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 { let b = b as f32; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. @@ -37548,8 +37454,7 @@ pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 { #[cfg_attr(test, assert_instr(vcvtusi2sd))] pub unsafe fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d { let b = b as f64; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\ @@ -37565,8 +37470,7 @@ pub unsafe fn _mm_comi_round_ss(a: __m128, b: _ static_assert_mantissas_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); - let r = vcomiss(a, b, IMM5, SAE); - transmute(r) + vcomiss(a, b, IMM5, SAE) } /// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\ @@ -37582,8 +37486,7 @@ pub unsafe fn _mm_comi_round_sd(a: __m128d, b: static_assert_mantissas_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); - let r = vcomisd(a, b, IMM5, SAE); - transmute(r) + vcomisd(a, b, IMM5, SAE) } /// Equal diff --git a/library/stdarch/crates/core_arch/src/x86/sse.rs b/library/stdarch/crates/core_arch/src/x86/sse.rs index 3d4471ba3..6a2be0921 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse.rs @@ -790,8 +790,7 @@ pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { /// /// The result is rounded according to the current rounding mode. If the result /// cannot be represented as a 32 bit integer the result will be `0x8000_0000` -/// (`i32::MIN`) or an invalid operation floating point exception if -/// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). +/// (`i32::MIN`). /// /// This corresponds to the `CVTSS2SI` instruction (with 32 bit output). /// @@ -821,8 +820,7 @@ pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 { /// /// The result is rounded always using truncation (round towards zero). If the /// result cannot be represented as a 32 bit integer the result will be -/// `0x8000_0000` (`i32::MIN`) or an invalid operation floating point -/// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). +/// `0x8000_0000` (`i32::MIN`). /// /// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output). /// @@ -1083,7 +1081,10 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 { #[cfg_attr(test, assert_instr(movmskps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { - movmskps(a) + // Propagate the highest bit to the rest, because simd_bitmask + // requires all-1 or all-0. + let mask: i32x4 = simd_lt(transmute(a), i32x4::splat(0)); + simd_bitmask::(mask).into() } /// Construct a `__m128` with the lowest element read from `p` and the other @@ -1365,6 +1366,15 @@ pub unsafe fn _mm_sfence() { /// Gets the unsigned 32-bit value of the MXCSR control and status register. 
/// +/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust +/// floating-point operations may or may not result in this register getting updated with exception +/// state, and the register can change between two invocations of this function even when no +/// floating-point operations appear in the source code (since floating-point operations appearing +/// earlier or later can be reordered). +/// +/// If you need to perform some floating-point operations and check whether they raised an +/// exception, use an inline assembly block for the entire sequence of operations. +/// /// For more info see [`_mm_setcsr`](fn._mm_setcsr.html) /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr) @@ -1372,6 +1382,10 @@ pub unsafe fn _mm_sfence() { #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(stmxcsr))] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] pub unsafe fn _mm_getcsr() -> u32 { let mut result = 0_i32; stmxcsr(&mut result as *mut _ as *mut i8); @@ -1401,6 +1415,16 @@ pub unsafe fn _mm_getcsr() -> u32 { /// * The *denormals-are-zero mode flag* turns all numbers which would be /// denormalized (exponent bits are all zeros) into zeros. /// +/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to +/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and +/// will optimize accordingly. This even applies when the register is altered and later reset to its +/// original value without any floating-point operations appearing in the source code between those +/// operations (since floating-point operations appearing earlier or later can be reordered). +/// +/// If you need to perform some floating-point operations under a different masking flags, rounding +/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the +/// original MXCSR register state before the end of the block. 
+/// /// ## Exception Flags /// /// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing @@ -1509,6 +1533,10 @@ pub unsafe fn _mm_getcsr() -> u32 { #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ldmxcsr))] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] pub unsafe fn _mm_setcsr(val: u32) { ldmxcsr(&val as *const _ as *const i8); } @@ -1588,9 +1616,14 @@ pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { _mm_getcsr() & _MM_MASK_MASK } @@ -1599,9 +1632,14 @@ pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { _mm_getcsr() & _MM_EXCEPT_MASK } @@ -1610,9 +1648,14 @@ pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { _mm_getcsr() & _MM_FLUSH_ZERO_MASK } @@ -1621,9 +1664,14 @@ pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { _mm_getcsr() & _MM_ROUND_MASK } @@ -1632,9 +1680,14 @@ pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x) } @@ -1643,9 +1696,14 @@ pub unsafe fn 
_MM_SET_EXCEPTION_MASK(x: u32) { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x) } @@ -1654,9 +1712,14 @@ pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x; // println!("setting csr={:x}", val); @@ -1667,9 +1730,14 @@ pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) { _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x) } @@ -1820,8 +1888,6 @@ extern "C" { fn maxss(a: __m128, b: __m128) -> __m128; #[link_name = "llvm.x86.sse.max.ps"] fn maxps(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse.movmsk.ps"] - fn movmskps(a: __m128) -> i32; #[link_name = "llvm.x86.sse.cmp.ps"] fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128; #[link_name = "llvm.x86.sse.comieq.ss"] @@ -1974,7 +2040,11 @@ mod tests { let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); let r = _mm_rcp_ss(a); let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0); - assert_eq_m128(r, e); + let rel_err = 0.00048828125; + assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err); + for i in 1..4 { + assert_eq!(get_m128(r, i), get_m128(e, i)); + } } #[simd_test(enable = "sse")] @@ -2055,6 +2125,17 @@ mod tests { let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); let r = _mm_max_ps(a, b); assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0)); + + // Check SSE-specific semantics for -0.0 handling. 
+ let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); + let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_max_ps(a, b)); + let r2: [u8; 16] = transmute(_mm_max_ps(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present } #[simd_test(enable = "sse")] @@ -2098,12 +2179,12 @@ mod tests { let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0); let r: u32x4 = transmute(_mm_cmpeq_ss(a, b)); - let e: u32x4 = transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0)); + let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0)); assert_eq!(r, e); let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2)); - let e2: u32x4 = transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0)); + let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0)); assert_eq!(r2, e2); } @@ -2119,15 +2200,15 @@ mod tests { let d1 = !0u32; // a.extract(0) < d.extract(0) let rb: u32x4 = transmute(_mm_cmplt_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmplt_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmplt_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2143,15 +2224,15 @@ mod tests { let d1 = !0u32; // a.extract(0) <= d.extract(0) let rb: u32x4 = transmute(_mm_cmple_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmple_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmple_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2167,15 +2248,15 @@ mod tests { let d1 = 0u32; // a.extract(0) > d.extract(0) let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2191,15 +2272,15 @@ mod tests { let d1 = 0u32; // a.extract(0) >= d.extract(0) let rb: u32x4 = transmute(_mm_cmpge_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpge_ss(a, c)); - 
let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpge_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2215,15 +2296,15 @@ mod tests { let d1 = !0u32; // a.extract(0) != d.extract(0) let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2244,15 +2325,15 @@ mod tests { let d1 = 0u32; // a.extract(0) >= d.extract(0) let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2273,15 +2354,15 @@ mod tests { let d1 = 0u32; // a.extract(0) > d.extract(0) let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2302,15 +2383,15 @@ mod tests { let d1 = !0u32; // a.extract(0) <= d.extract(0) let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2331,15 +2412,15 @@ mod tests { let d1 = !0u32; // a.extract(0) < d.extract(0) let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = 
transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2355,15 +2436,15 @@ mod tests { let d1 = !0u32; // a.extract(0) ord d.extract(0) let rb: u32x4 = transmute(_mm_cmpord_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpord_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpord_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2379,15 +2460,15 @@ mod tests { let d1 = 0u32; // a.extract(0) unord d.extract(0) let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2766,7 +2847,9 @@ mod tests { } } + #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions #[simd_test(enable = "sse")] + #[cfg_attr(miri, ignore)] // Uses _mm_setcsr, which is not supported by Miri unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() { // If one of the arguments is a quiet NaN `comieq_ss` should signal an // Invalid Operation Exception while `ucomieq_ss` should not. 
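The deprecation notes above point to inline assembly as the replacement for `_mm_getcsr`/`_mm_setcsr`. A minimal sketch of that replacement, assuming an x86-64 target and `std::arch::asm`; the helper names are illustrative and not part of this patch:

```rust
use std::arch::asm;

// Read MXCSR, roughly what the deprecated `_mm_getcsr` does.
unsafe fn read_mxcsr() -> u32 {
    let mut csr: u32 = 0;
    asm!("stmxcsr [{}]", in(reg) &mut csr, options(nostack, preserves_flags));
    csr
}

// Write MXCSR, roughly what the deprecated `_mm_setcsr` does.
unsafe fn write_mxcsr(csr: u32) {
    asm!("ldmxcsr [{}]", in(reg) &csr, options(nostack, preserves_flags));
}
```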
@@ -3072,7 +3155,7 @@ mod tests { let mut p = vals.as_mut_ptr(); if (p as usize) & 0xf != 0 { - ofs = ((16 - (p as usize)) & 0xf) >> 2; + ofs = (16 - ((p as usize) & 0xf)) >> 2; p = p.add(ofs); } @@ -3098,7 +3181,7 @@ mod tests { // Align p to 16-byte boundary if (p as usize) & 0xf != 0 { - ofs = ((16 - (p as usize)) & 0xf) >> 2; + ofs = (16 - ((p as usize) & 0xf)) >> 2; p = p.add(ofs); } @@ -3124,7 +3207,7 @@ mod tests { // Align p to 16-byte boundary if (p as usize) & 0xf != 0 { - ofs = ((16 - (p as usize)) & 0xf) >> 2; + ofs = (16 - ((p as usize) & 0xf)) >> 2; p = p.add(ofs); } @@ -3186,11 +3269,15 @@ mod tests { } #[simd_test(enable = "sse")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + #[cfg_attr(miri, ignore)] unsafe fn test_mm_sfence() { _mm_sfence(); } + #[allow(deprecated)] // FIXME: This tests functions that are immediate UB #[simd_test(enable = "sse")] + #[cfg_attr(miri, ignore)] // Miri does not support accesing the CSR unsafe fn test_mm_getcsr_setcsr_1() { let saved_csr = _mm_getcsr(); @@ -3206,7 +3293,9 @@ mod tests { assert_eq_m128(r, exp); // first component is a denormalized f32 } + #[allow(deprecated)] // FIXME: This tests functions that are immediate UB #[simd_test(enable = "sse")] + #[cfg_attr(miri, ignore)] // Miri does not support accesing the CSR unsafe fn test_mm_getcsr_setcsr_2() { // Same as _mm_setcsr_1 test, but with opposite flag value. @@ -3224,7 +3313,9 @@ mod tests { assert_eq_m128(r, exp); // first component is a denormalized f32 } + #[allow(deprecated)] // FIXME: This tests functions that are immediate UB #[simd_test(enable = "sse")] + #[cfg_attr(miri, ignore)] // Miri does not support accesing the CSR unsafe fn test_mm_getcsr_setcsr_underflow() { _MM_SET_EXCEPTION_STATE(0); @@ -3263,6 +3354,9 @@ mod tests { } #[simd_test(enable = "sse")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] unsafe fn test_mm_stream_ps() { let a = _mm_set1_ps(7.0); let mut mem = Memory { data: [-1.0; 4] }; diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs index 3d572a1f5..7831ea743 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse2.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs @@ -165,7 +165,10 @@ pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pavgb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { - transmute(pavgb(a.as_u8x16(), b.as_u8x16())) + let a = simd_cast::<_, u16x16>(a.as_u8x16()); + let b = simd_cast::<_, u16x16>(b.as_u8x16()); + let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); + transmute(simd_cast::<_, u8x16>(r)) } /// Averages packed unsigned 16-bit integers in `a` and `b`. @@ -176,7 +179,10 @@ pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pavgw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { - transmute(pavgw(a.as_u16x8(), b.as_u16x8())) + let a = simd_cast::<_, u32x8>(a.as_u16x8()); + let b = simd_cast::<_, u32x8>(b.as_u16x8()); + let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); + transmute(simd_cast::<_, u16x8>(r)) } /// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`. 
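The `_mm_avg_epu8`/`_mm_avg_epu16` bodies above replace the `pavgb`/`pavgw` link-name intrinsics with generic vector arithmetic. A scalar model of the same computation, included only to make the rounding bit and the need for widening explicit; the helper name is made up for the example:

```rust
// Rounding average: widen, add the rounding bit, shift back down.
// In u8 arithmetic `a + b + 1` could wrap; in u16 it cannot.
fn avg_u8(a: u8, b: u8) -> u8 {
    ((u16::from(a) + u16::from(b) + 1) >> 1) as u8
}

fn main() {
    assert_eq!(avg_u8(255, 254), 255); // would overflow without widening
    assert_eq!(avg_u8(0, 1), 1); // ties round up, matching `pavgb`
}
```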
@@ -261,7 +267,10 @@ pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pmulhw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pmulhw(a.as_i16x8(), b.as_i16x8()))
+    let a = simd_cast::<_, i32x8>(a.as_i16x8());
+    let b = simd_cast::<_, i32x8>(b.as_i16x8());
+    let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
+    transmute(simd_cast::<i32x8, i16x8>(r))
 }
 
 /// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
@@ -275,7 +284,10 @@ pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pmulhuw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pmulhuw(a.as_u16x8(), b.as_u16x8()))
+    let a = simd_cast::<_, u32x8>(a.as_u16x8());
+    let b = simd_cast::<_, u32x8>(b.as_u16x8());
+    let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
+    transmute(simd_cast::<u32x8, u16x8>(r))
 }
 
 /// Multiplies the packed 16-bit integers in `a` and `b`.
@@ -303,7 +315,10 @@ pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pmuludq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pmuludq(a.as_u32x4(), b.as_u32x4()))
+    let a = a.as_u64x2();
+    let b = b.as_u64x2();
+    let mask = u64x2::splat(u32::MAX.into());
+    transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
 }
 
 /// Sum the absolute differences of packed unsigned 8-bit integers.
@@ -952,7 +967,7 @@ pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
 #[cfg_attr(test, assert_instr(cvtdq2ps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
-    cvtdq2ps(a.as_i32x4())
+    transmute(simd_cast::<_, f32x4>(a.as_i32x4()))
 }
 
 /// Converts packed single-precision (32-bit) floating-point elements in `a`
@@ -2240,7 +2255,9 @@ pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
 #[cfg_attr(test, assert_instr(cvtpd2ps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
-    cvtpd2ps(a)
+    let r = simd_cast::<_, f32x2>(a.as_f64x2());
+    let zero = f32x2::new(0.0, 0.0);
+    transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
 }
 
 /// Converts packed single-precision (32-bit) floating-point elements in `a` to
@@ -2253,7 +2270,8 @@ pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
 #[cfg_attr(test, assert_instr(cvtps2pd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d {
-    cvtps2pd(a)
+    let a = a.as_f32x4();
+    transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
 }
 
 /// Converts packed double-precision (64-bit) floating-point elements in `a` to
@@ -2432,7 +2450,10 @@ pub unsafe fn _mm_setzero_pd() -> __m128d {
 #[cfg_attr(test, assert_instr(movmskpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 {
-    movmskpd(a)
+    // Propagate the highest bit to the rest, because simd_bitmask
+    // requires all-1 or all-0.
+ let mask: i64x2 = simd_lt(transmute(a), i64x2::splat(0)); + simd_bitmask::(mask).into() } /// Loads 128-bits (composed of 2 packed double-precision (64-bit) @@ -2826,18 +2847,8 @@ extern "C" { fn lfence(); #[link_name = "llvm.x86.sse2.mfence"] fn mfence(); - #[link_name = "llvm.x86.sse2.pavg.b"] - fn pavgb(a: u8x16, b: u8x16) -> u8x16; - #[link_name = "llvm.x86.sse2.pavg.w"] - fn pavgw(a: u16x8, b: u16x8) -> u16x8; #[link_name = "llvm.x86.sse2.pmadd.wd"] fn pmaddwd(a: i16x8, b: i16x8) -> i32x4; - #[link_name = "llvm.x86.sse2.pmulh.w"] - fn pmulhw(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.sse2.pmulhu.w"] - fn pmulhuw(a: u16x8, b: u16x8) -> u16x8; - #[link_name = "llvm.x86.sse2.pmulu.dq"] - fn pmuludq(a: u32x4, b: u32x4) -> u64x2; #[link_name = "llvm.x86.sse2.psad.bw"] fn psadbw(a: u8x16, b: u8x16) -> u64x2; #[link_name = "llvm.x86.sse2.psll.w"] @@ -2856,8 +2867,6 @@ extern "C" { fn psrld(a: i32x4, count: i32x4) -> i32x4; #[link_name = "llvm.x86.sse2.psrl.q"] fn psrlq(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.sse2.cvtdq2ps"] - fn cvtdq2ps(a: i32x4) -> __m128; #[link_name = "llvm.x86.sse2.cvtps2dq"] fn cvtps2dq(a: __m128) -> i32x4; #[link_name = "llvm.x86.sse2.maskmov.dqu"] @@ -2908,12 +2917,6 @@ extern "C" { fn ucomigesd(a: __m128d, b: __m128d) -> i32; #[link_name = "llvm.x86.sse2.ucomineq.sd"] fn ucomineqsd(a: __m128d, b: __m128d) -> i32; - #[link_name = "llvm.x86.sse2.movmsk.pd"] - fn movmskpd(a: __m128d) -> i32; - #[link_name = "llvm.x86.sse2.cvtpd2ps"] - fn cvtpd2ps(a: __m128d) -> __m128; - #[link_name = "llvm.x86.sse2.cvtps2pd"] - fn cvtps2pd(a: __m128) -> __m128d; #[link_name = "llvm.x86.sse2.cvtpd2dq"] fn cvtpd2dq(a: __m128d) -> i32x4; #[link_name = "llvm.x86.sse2.cvtsd2si"] @@ -2956,11 +2959,15 @@ mod tests { } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + #[cfg_attr(miri, ignore)] unsafe fn test_mm_lfence() { _mm_lfence(); } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + #[cfg_attr(miri, ignore)] unsafe fn test_mm_mfence() { _mm_mfence(); } @@ -3343,83 +3350,124 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_slli_epi16() { - #[rustfmt::skip] - let a = _mm_setr_epi16( - 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0, - ); + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); let r = _mm_slli_epi16::<4>(a); - - #[rustfmt::skip] - let e = _mm_setr_epi16( - 0xFFF0 as u16 as i16, 0xFFF0 as u16 as i16, 0x0FF0, 0x00F0, - 0, 0, 0, 0, + assert_eq_m128i( + r, + _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0), ); - assert_eq_m128i(r, e); + let r = _mm_slli_epi16::<16>(a); + assert_eq_m128i(r, _mm_set1_epi16(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_sll_epi16() { - let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0); - let r = _mm_sll_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0)); - assert_eq_m128i(r, _mm_setr_epi16(0xFF0, 0, 0, 0, 0, 0, 0, 0)); - let r = _mm_sll_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0)); - assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0)); + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i( + r, + _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0), + ); + let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16)); + 
assert_eq_m128i(r, _mm_set1_epi16(0)); + let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi16(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_slli_epi32() { - let r = _mm_slli_epi32::<4>(_mm_set1_epi32(0xFFFF)); - assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0)); + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_slli_epi32::<4>(a); + assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0)); + let r = _mm_slli_epi32::<32>(a); + assert_eq_m128i(r, _mm_set1_epi32(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_sll_epi32() { - let a = _mm_set1_epi32(0xFFFF); - let b = _mm_setr_epi32(4, 0, 0, 0); - let r = _mm_sll_epi32(a, b); - assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0)); + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0)); + let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m128i(r, _mm_set1_epi32(0)); + let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi32(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_slli_epi64() { - let r = _mm_slli_epi64::<4>(_mm_set1_epi64x(0xFFFFFFFF)); - assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0)); + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_slli_epi64::<4>(a); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0)); + let r = _mm_slli_epi64::<64>(a); + assert_eq_m128i(r, _mm_set1_epi64x(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_sll_epi64() { - let a = _mm_set1_epi64x(0xFFFFFFFF); - let b = _mm_setr_epi64x(4, 0); - let r = _mm_sll_epi64(a, b); - assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0)); + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0)); + let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srai_epi16() { - let r = _mm_srai_epi16::<1>(_mm_set1_epi16(-1)); - assert_eq_m128i(r, _mm_set1_epi16(-1)); + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_srai_epi16::<4>(a); + assert_eq_m128i( + r, + _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10), + ); + let r = _mm_srai_epi16::<16>(a); + assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_sra_epi16() { - let a = _mm_set1_epi16(-1); - let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - let r = _mm_sra_epi16(a, b); - assert_eq_m128i(r, _mm_set1_epi16(-1)); + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i( + r, + _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10), + ); + let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1)); + let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srai_epi32() { - 
let r = _mm_srai_epi32::<1>(_mm_set1_epi32(-1)); - assert_eq_m128i(r, _mm_set1_epi32(-1)); + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_srai_epi32::<4>(a); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000)); + let r = _mm_srai_epi32::<32>(a); + assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_sra_epi32() { - let a = _mm_set1_epi32(-1); - let b = _mm_setr_epi32(1, 0, 0, 0); - let r = _mm_sra_epi32(a, b); - assert_eq_m128i(r, _mm_set1_epi32(-1)); + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000)); + let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1)); + let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1)); } #[simd_test(enable = "sse2")] @@ -3453,53 +3501,74 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_srli_epi16() { - #[rustfmt::skip] - let a = _mm_setr_epi16( - 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0, - ); + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); let r = _mm_srli_epi16::<4>(a); - #[rustfmt::skip] - let e = _mm_setr_epi16( - 0xFFF as u16 as i16, 0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0, + assert_eq_m128i( + r, + _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0), ); - assert_eq_m128i(r, e); + let r = _mm_srli_epi16::<16>(a); + assert_eq_m128i(r, _mm_set1_epi16(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srl_epi16() { - let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0); - let r = _mm_srl_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0)); - assert_eq_m128i(r, _mm_setr_epi16(0xF, 0, 0, 0, 0, 0, 0, 0)); - let r = _mm_srl_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0)); - assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0)); + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i( + r, + _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0), + ); + let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m128i(r, _mm_set1_epi16(0)); + let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi16(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srli_epi32() { - let r = _mm_srli_epi32::<4>(_mm_set1_epi32(0xFFFF)); - assert_eq_m128i(r, _mm_set1_epi32(0xFFF)); + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_srli_epi32::<4>(a); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000)); + let r = _mm_srli_epi32::<32>(a); + assert_eq_m128i(r, _mm_set1_epi32(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srl_epi32() { - let a = _mm_set1_epi32(0xFFFF); - let b = _mm_setr_epi32(4, 0, 0, 0); - let r = _mm_srl_epi32(a, b); - assert_eq_m128i(r, _mm_set1_epi32(0xFFF)); + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000)); + let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m128i(r, _mm_set1_epi32(0)); + let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 
i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi32(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srli_epi64() { - let r = _mm_srli_epi64::<4>(_mm_set1_epi64x(0xFFFFFFFF)); - assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF)); + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_srli_epi64::<4>(a); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000)); + let r = _mm_srli_epi64::<64>(a); + assert_eq_m128i(r, _mm_set1_epi64x(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srl_epi64() { - let a = _mm_set1_epi64x(0xFFFFFFFF); - let b = _mm_setr_epi64x(4, 0); - let r = _mm_srl_epi64(a, b); - assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF)); + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000)); + let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); } #[simd_test(enable = "sse2")] @@ -3766,6 +3835,9 @@ mod tests { } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] unsafe fn test_mm_maskmoveu_si128() { let a = _mm_set1_epi8(9); #[rustfmt::skip] @@ -3804,6 +3876,9 @@ mod tests { } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] unsafe fn test_mm_stream_si128() { let a = _mm_setr_epi32(1, 2, 3, 4); let mut r = _mm_undefined_si128(); @@ -3812,6 +3887,9 @@ mod tests { } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] unsafe fn test_mm_stream_si32() { let a: i32 = 7; let mut mem = boxed::Box::::new(-1); @@ -4055,6 +4133,17 @@ mod tests { let b = _mm_setr_pd(5.0, 10.0); let r = _mm_max_pd(a, b); assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0)); + + // Check SSE(2)-specific semantics for -0.0 handling. + let a = _mm_setr_pd(-0.0, 0.0); + let b = _mm_setr_pd(0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_max_pd(a, b)); + let r2: [u8; 16] = transmute(_mm_max_pd(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present } #[simd_test(enable = "sse2")] @@ -4071,6 +4160,17 @@ mod tests { let b = _mm_setr_pd(5.0, 10.0); let r = _mm_min_pd(a, b); assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0)); + + // Check SSE(2)-specific semantics for -0.0 handling. 
+ let a = _mm_setr_pd(-0.0, 0.0); + let b = _mm_setr_pd(0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_min_pd(a, b)); + let r2: [u8; 16] = transmute(_mm_min_pd(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present } #[simd_test(enable = "sse2")] @@ -4158,7 +4258,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpeq_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b)); assert_eq_m128i(r, e); } @@ -4166,7 +4266,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmplt_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b)); assert_eq_m128i(r, e); } @@ -4174,7 +4274,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmple_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b)); assert_eq_m128i(r, e); } @@ -4182,7 +4282,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpgt_sd() { let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b)); assert_eq_m128i(r, e); } @@ -4190,7 +4290,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpge_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b)); assert_eq_m128i(r, e); } @@ -4198,7 +4298,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpord_sd() { let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0)); - let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b)); assert_eq_m128i(r, e); } @@ -4206,7 +4306,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpunord_sd() { let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b)); assert_eq_m128i(r, e); } @@ -4214,7 +4314,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpneq_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b)); assert_eq_m128i(r, e); } @@ -4222,7 +4322,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpnlt_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); - let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b)); assert_eq_m128i(r, e); } @@ -4230,7 +4330,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpnle_sd() { let (a, b) = 
(_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b)); assert_eq_m128i(r, e); } @@ -4238,7 +4338,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpngt_sd() { let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b)); assert_eq_m128i(r, e); } @@ -4246,7 +4346,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpnge_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b)); assert_eq_m128i(r, e); } @@ -4478,6 +4578,9 @@ mod tests { } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] unsafe fn test_mm_stream_pd() { #[repr(align(128))] struct Memory { diff --git a/library/stdarch/crates/core_arch/src/x86/sse3.rs b/library/stdarch/crates/core_arch/src/x86/sse3.rs index 092a8d9cd..df0d78e5b 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse3.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse3.rs @@ -1,7 +1,7 @@ //! Streaming SIMD Extensions 3 (SSE3) use crate::{ - core_arch::{simd::*, simd_llvm::simd_shuffle, x86::*}, + core_arch::{simd::*, simd_llvm::*, x86::*}, mem::transmute, }; @@ -17,7 +17,11 @@ use stdarch_test::assert_instr; #[cfg_attr(test, assert_instr(addsubps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { - addsubps(a, b) + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [4, 1, 6, 3]) } /// Alternatively add and subtract packed double-precision (64-bit) @@ -29,7 +33,11 @@ pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { #[cfg_attr(test, assert_instr(addsubpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { - addsubpd(a, b) + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [2, 1]) } /// Horizontally adds adjacent pairs of double-precision (64-bit) @@ -143,10 +151,6 @@ pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 { #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.sse3.addsub.ps"] - fn addsubps(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse3.addsub.pd"] - fn addsubpd(a: __m128d, b: __m128d) -> __m128d; #[link_name = "llvm.x86.sse3.hadd.pd"] fn haddpd(a: __m128d, b: __m128d) -> __m128d; #[link_name = "llvm.x86.sse3.hadd.ps"] diff --git a/library/stdarch/crates/core_arch/src/x86/sse41.rs b/library/stdarch/crates/core_arch/src/x86/sse41.rs index 7ba86e5f7..6d33238b0 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse41.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs @@ -62,7 +62,8 @@ pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTI #[cfg_attr(test, assert_instr(pblendvb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i { - transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16())) + let 
mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::splat(0));
+    transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
 }
 
 /// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
@@ -74,15 +75,25 @@ pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
 #[inline]
 #[target_feature(enable = "sse4.1")]
-// Note: LLVM7 prefers the single-precision floating-point domain when possible
-// see https://bugs.llvm.org/show_bug.cgi?id=38195
-// #[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xF0))]
-#[cfg_attr(test, assert_instr(blendps, IMM8 = 0xF0))]
+#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
     static_assert_uimm_bits!(IMM8, 8);
-    transmute(pblendw(a.as_i16x8(), b.as_i16x8(), IMM8 as u8))
+    transmute::<i16x8, _>(simd_shuffle!(
+        a.as_i16x8(),
+        b.as_i16x8(),
+        [
+            [0, 8][IMM8 as usize & 1],
+            [1, 9][(IMM8 >> 1) as usize & 1],
+            [2, 10][(IMM8 >> 2) as usize & 1],
+            [3, 11][(IMM8 >> 3) as usize & 1],
+            [4, 12][(IMM8 >> 4) as usize & 1],
+            [5, 13][(IMM8 >> 5) as usize & 1],
+            [6, 14][(IMM8 >> 6) as usize & 1],
+            [7, 15][(IMM8 >> 7) as usize & 1],
+        ]
+    ))
 }
 
 /// Blend packed double-precision (64-bit) floating-point elements from `a`
@@ -94,7 +105,8 @@ pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i) -> __m128
 #[cfg_attr(test, assert_instr(blendvpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
-    blendvpd(a, b, mask)
+    let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::splat(0));
+    transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
 }
 
 /// Blend packed single-precision (32-bit) floating-point elements from `a`
@@ -106,7 +118,8 @@ pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(blendvps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
-    blendvps(a, b, mask)
+    let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::splat(0));
+    transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
 }
 
 /// Blend packed double-precision (64-bit) floating-point elements from `a`
@@ -123,7 +136,11 @@ pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
     static_assert_uimm_bits!(IMM2, 2);
-    blendpd(a, b, IMM2 as u8)
+    transmute::<f64x2, _>(simd_shuffle!(
+        a.as_f64x2(),
+        b.as_f64x2(),
+        [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
+    ))
 }
 
 /// Blend packed single-precision (32-bit) floating-point elements from `a`
@@ -137,7 +154,16 @@ pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d) -> __m128d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
     static_assert_uimm_bits!(IMM4, 4);
-    blendps(a, b, IMM4 as u8)
+    transmute::<f32x4, _>(simd_shuffle!(
+        a.as_f32x4(),
+        b.as_f32x4(),
+        [
+            [0, 4][IMM4 as usize & 1],
+            [1, 5][(IMM4 >> 1) as usize & 1],
+            [2, 6][(IMM4 >> 2) as usize & 1],
+            [3, 7][(IMM4 >> 3) as usize & 1],
+        ]
+    ))
 }
 
 /// Extracts a single-precision (32-bit) floating-point element from `a`,
@@ -175,7 +201,7 @@ pub unsafe fn _mm_blend_ps(a: __m128, b: __m128) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_extract_ps(a: __m128) -> i32 { static_assert_uimm_bits!(IMM8, 2); - transmute(simd_extract::<_, f32>(a, IMM8 as u32)) + simd_extract::<_, f32>(a, IMM8 as u32).to_bits() as i32 } /// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit @@ -923,7 +949,9 @@ pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmuldq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { - transmute(pmuldq(a.as_i32x4(), b.as_i32x4())) + let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2())); + let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2())); + transmute(simd_mul(a, b)) } /// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate @@ -1124,18 +1152,6 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.sse41.pblendvb"] - fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16; - #[link_name = "llvm.x86.sse41.blendvpd"] - fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d; - #[link_name = "llvm.x86.sse41.blendvps"] - fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128; - #[link_name = "llvm.x86.sse41.blendpd"] - fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d; - #[link_name = "llvm.x86.sse41.blendps"] - fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128; - #[link_name = "llvm.x86.sse41.pblendw"] - fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8; #[link_name = "llvm.x86.sse41.insertps"] fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128; #[link_name = "llvm.x86.sse41.packusdw"] @@ -1154,8 +1170,6 @@ extern "C" { fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128; #[link_name = "llvm.x86.sse41.phminposuw"] fn phminposuw(a: u16x8) -> u16x8; - #[link_name = "llvm.x86.sse41.pmuldq"] - fn pmuldq(a: i32x4, b: i32x4) -> i64x2; #[link_name = "llvm.x86.sse41.mpsadbw"] fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8; #[link_name = "llvm.x86.sse41.ptestz"] @@ -1245,9 +1259,9 @@ mod tests { #[simd_test(enable = "sse4.1")] unsafe fn test_mm_extract_ps() { let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0); - let r: f32 = transmute(_mm_extract_ps::<1>(a)); + let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32); assert_eq!(r, 1.0); - let r: f32 = transmute(_mm_extract_ps::<3>(a)); + let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32); assert_eq!(r, 3.0); } @@ -1668,6 +1682,7 @@ mod tests { assert_eq_m128(r, e); } + #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions #[simd_test(enable = "sse4.1")] unsafe fn test_mm_round_sd() { let a = _mm_setr_pd(1.5, 3.5); @@ -1680,6 +1695,7 @@ mod tests { assert_eq_m128d(r, e); } + #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions #[simd_test(enable = "sse4.1")] unsafe fn test_mm_round_ss() { let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); diff --git a/library/stdarch/crates/core_arch/src/x86/test.rs b/library/stdarch/crates/core_arch/src/x86/test.rs index ec4298033..50b2d93be 100644 --- a/library/stdarch/crates/core_arch/src/x86/test.rs +++ b/library/stdarch/crates/core_arch/src/x86/test.rs @@ -3,11 +3,13 @@ use crate::core_arch::x86::*; use std::mem::transmute; +#[track_caller] #[target_feature(enable = "sse2")] pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) { assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b)) } +#[track_caller] #[target_feature(enable = 
"sse2")] pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) { if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 { @@ -20,6 +22,7 @@ pub unsafe fn get_m128d(a: __m128d, idx: usize) -> f64 { transmute::<_, [f64; 2]>(a)[idx] } +#[track_caller] #[target_feature(enable = "sse")] pub unsafe fn assert_eq_m128(a: __m128, b: __m128) { let r = _mm_cmpeq_ps(a, b); @@ -40,11 +43,13 @@ pub unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i { _mm_set_epi64x(b, a) } +#[track_caller] #[target_feature(enable = "avx")] pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) { assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b)) } +#[track_caller] #[target_feature(enable = "avx")] pub unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) { let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b); @@ -58,6 +63,7 @@ pub unsafe fn get_m256d(a: __m256d, idx: usize) -> f64 { transmute::<_, [f64; 4]>(a)[idx] } +#[track_caller] #[target_feature(enable = "avx")] pub unsafe fn assert_eq_m256(a: __m256, b: __m256) { let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b); @@ -125,10 +131,12 @@ mod x86_polyfill { } pub use self::x86_polyfill::*; +#[track_caller] pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { assert_eq!(transmute::<_, [i32; 16]>(a), transmute::<_, [i32; 16]>(b)) } +#[track_caller] pub unsafe fn assert_eq_m512(a: __m512, b: __m512) { let cmp = _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b); if cmp != 0b11111111_11111111 { @@ -136,6 +144,7 @@ pub unsafe fn assert_eq_m512(a: __m512, b: __m512) { } } +#[track_caller] pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) { let cmp = _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b); if cmp != 0b11111111 { diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs index 68f332767..bace11d13 100644 --- a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs @@ -33,7 +33,7 @@ pub unsafe fn _mm_cvtss_i64(a: __m128) -> i64 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2usi))] pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 { - transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst. @@ -43,7 +43,7 @@ pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2usi))] pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 { - transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. @@ -54,8 +54,7 @@ pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 { #[cfg_attr(test, assert_instr(vcvtsi2ss))] pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 { let b = b as f32; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
@@ -66,8 +65,7 @@ pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 { #[cfg_attr(test, assert_instr(vcvtsi2sd))] pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d { let b = b as f64; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. @@ -78,8 +76,7 @@ pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d { #[cfg_attr(test, assert_instr(vcvtusi2ss))] pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 { let b = b as f32; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. @@ -90,8 +87,7 @@ pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 { #[cfg_attr(test, assert_instr(vcvtusi2sd))] pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d { let b = b as f64; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst. @@ -101,7 +97,7 @@ pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2si))] pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 { - transmute(vcvtsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst. @@ -111,7 +107,7 @@ pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2usi))] pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 { - transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst. @@ -121,7 +117,7 @@ pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2si))] pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 { - transmute(vcvtss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst. @@ -131,7 +127,7 @@ pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2usi))] pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 { - transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
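The `#[track_caller]` attributes added to the `assert_eq_m128*` helpers in `core_arch/src/x86/test.rs` above make a failing comparison report the line in the test that called the helper rather than the `assert_eq!` inside the helper. A small standalone illustration of the mechanism; the function is hypothetical:

```rust
// With #[track_caller], the panic location for a failed assertion is the
// caller of this helper, which is what you want in a test suite.
#[track_caller]
fn assert_eq_u32(a: u32, b: u32) {
    assert_eq!(a, b);
}

fn main() {
    assert_eq_u32(2 + 2, 4); // a failure here would be reported at this line
}
```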
@@ -270,8 +266,7 @@ pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64) -> __m pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - let r = vcvtsd2si64(a, ROUNDING); - transmute(r) + vcvtsd2si64(a, ROUNDING) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ @@ -290,8 +285,7 @@ pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - let r = vcvtsd2si64(a, ROUNDING); - transmute(r) + vcvtsd2si64(a, ROUNDING) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\ @@ -310,8 +304,7 @@ pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - let r = vcvtsd2usi64(a, ROUNDING); - transmute(r) + vcvtsd2usi64(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ @@ -330,8 +323,7 @@ pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { pub unsafe fn _mm_cvt_roundss_si64(a: __m128) -> i64 { static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - let r = vcvtss2si64(a, ROUNDING); - transmute(r) + vcvtss2si64(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ @@ -350,8 +342,7 @@ pub unsafe fn _mm_cvt_roundss_si64(a: __m128) -> i64 { pub unsafe fn _mm_cvt_roundss_i64(a: __m128) -> i64 { static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - let r = vcvtss2si64(a, ROUNDING); - transmute(r) + vcvtss2si64(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\ @@ -370,8 +361,7 @@ pub unsafe fn _mm_cvt_roundss_i64(a: __m128) -> i64 { pub unsafe fn _mm_cvt_roundss_u64(a: __m128) -> u64 { static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - let r = vcvtss2usi64(a, ROUNDING); - transmute(r) + vcvtss2usi64(a, ROUNDING) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ @@ -385,8 +375,7 @@ pub unsafe fn _mm_cvt_roundss_u64(a: __m128) -> u64 { pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { static_assert_sae!(SAE); let a = a.as_f64x2(); - let r = vcvtsd2si64(a, SAE); - transmute(r) + vcvtsd2si64(a, SAE) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ @@ -400,8 +389,7 @@ pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { static_assert_sae!(SAE); let a = a.as_f64x2(); - let r = vcvtsd2si64(a, SAE); - transmute(r) + vcvtsd2si64(a, SAE) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\ @@ -415,8 +403,7 @@ pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { static_assert_sae!(SAE); let a = a.as_f64x2(); - let r = vcvtsd2usi64(a, SAE); - transmute(r) + vcvtsd2usi64(a, SAE) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with 
truncation, and store the result in dst.\ @@ -430,8 +417,7 @@ pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { pub unsafe fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { static_assert_sae!(SAE); let a = a.as_f32x4(); - let r = vcvtss2si64(a, SAE); - transmute(r) + vcvtss2si64(a, SAE) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ @@ -445,8 +431,7 @@ pub unsafe fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { pub unsafe fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { static_assert_sae!(SAE); let a = a.as_f32x4(); - let r = vcvtss2si64(a, SAE); - transmute(r) + vcvtss2si64(a, SAE) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\ @@ -460,8 +445,7 @@ pub unsafe fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { pub unsafe fn _mm_cvtt_roundss_u64(a: __m128) -> u64 { static_assert_sae!(SAE); let a = a.as_f32x4(); - let r = vcvtss2usi64(a, SAE); - transmute(r) + vcvtss2usi64(a, SAE) } #[allow(improper_ctypes)] diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse2.rs b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs index bf2394eba..9619cb748 100644 --- a/library/stdarch/crates/core_arch/src/x86_64/sse2.rs +++ b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs @@ -181,6 +181,9 @@ mod tests { } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] unsafe fn test_mm_stream_si64() { let a: i64 = 7; let mut mem = boxed::Box::::new(-1); diff --git a/library/stdarch/crates/intrinsic-test/Cargo.toml b/library/stdarch/crates/intrinsic-test/Cargo.toml index d977dd659..c7a18f77f 100644 --- a/library/stdarch/crates/intrinsic-test/Cargo.toml +++ b/library/stdarch/crates/intrinsic-test/Cargo.toml @@ -12,10 +12,10 @@ lazy_static = "1.4.0" serde = { version = "1", features = ["derive"] } serde_json = "1.0" csv = "1.1" -clap = "2.33.3" +clap = { version = "4.4", features = ["derive"] } regex = "1.4.2" log = "0.4.11" -pretty_env_logger = "0.4.0" +pretty_env_logger = "0.5.0" rayon = "1.5.0" diff = "0.1.12" -itertools = "0.10.1" +itertools = "0.11.0" diff --git a/library/stdarch/crates/intrinsic-test/README.md b/library/stdarch/crates/intrinsic-test/README.md index 8a8ddab40..2b3f0c75a 100644 --- a/library/stdarch/crates/intrinsic-test/README.md +++ b/library/stdarch/crates/intrinsic-test/README.md @@ -4,15 +4,17 @@ each produces the same result from random inputs. 

 # Usage
 ```
 USAGE:
-    intrinsic-test [OPTIONS]
+    intrinsic-test [FLAGS] [OPTIONS]

 FLAGS:
+        --a32            Run tests for A32 instrinsics instead of A64
     -h, --help           Prints help information
     -V, --version        Prints version information

 OPTIONS:
         --cppcompiler    The C++ compiler to use for compiling the c++ code [default: clang++]
         --runner         Run the C programs under emulation with this command
+        --skip           Filename for a list of intrinsics to skip (one per line)
         --toolchain      The rust toolchain to use for building the rust code

 ARGS:
diff --git a/library/stdarch/crates/intrinsic-test/src/json_parser.rs b/library/stdarch/crates/intrinsic-test/src/json_parser.rs
index bc6fa4a9e..8b3c7869c 100644
--- a/library/stdarch/crates/intrinsic-test/src/json_parser.rs
+++ b/library/stdarch/crates/intrinsic-test/src/json_parser.rs
@@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::path::Path;

 use serde::Deserialize;

@@ -41,7 +42,7 @@ struct JsonIntrinsic {
     architectures: Vec,
 }

-pub fn get_neon_intrinsics(filename: &str) -> Result, Box> {
+pub fn get_neon_intrinsics(filename: &Path) -> Result, Box> {
     let file = std::fs::File::open(filename)?;
     let reader = std::io::BufReader::new(file);
     let json: Vec = serde_json::from_reader(reader).expect("Couldn't parse JSON");
diff --git a/library/stdarch/crates/intrinsic-test/src/main.rs b/library/stdarch/crates/intrinsic-test/src/main.rs
index 76d2da3ab..15bc021c7 100644
--- a/library/stdarch/crates/intrinsic-test/src/main.rs
+++ b/library/stdarch/crates/intrinsic-test/src/main.rs
@@ -4,9 +4,9 @@ extern crate log;

 use std::fs::File;
 use std::io::Write;
+use std::path::PathBuf;
 use std::process::Command;

-use clap::{App, Arg};
 use intrinsic::Intrinsic;
 use itertools::Itertools;
 use rayon::prelude::*;
@@ -320,58 +320,47 @@ path = "{intrinsic}/main.rs""#,
     }
 }

+/// Intrinsic test tool
+#[derive(clap::Parser)]
+#[command(
+    name = "Intrinsic test tool",
+    about = "Generates Rust and C programs for intrinsics and compares the output"
+)]
+struct Cli {
+    /// The input file containing the intrinsics
+    input: PathBuf,
+
+    /// The rust toolchain to use for building the rust code
+    #[arg(long)]
+    toolchain: Option,
+
+    /// The C++ compiler to use for compiling the c++ code
+    #[arg(long, default_value_t = String::from("clang++"))]
+    cppcompiler: String,
+
+    /// Run the C programs under emulation with this command
+    #[arg(long)]
+    runner: Option,
+
+    /// Filename for a list of intrinsics to skip (one per line)
+    #[arg(long)]
+    skip: Option,
+
+    /// Run tests for A32 instrinsics instead of A64
+    #[arg(long)]
+    a32: bool,
+}
+
 fn main() {
     pretty_env_logger::init();

-    let matches = App::new("Intrinsic test tool")
-        .about("Generates Rust and C programs for intrinsics and compares the output")
-        .arg(
-            Arg::with_name("INPUT")
-                .help("The input file containing the intrinsics")
-                .required(true)
-                .index(1),
-        )
-        .arg(
-            Arg::with_name("TOOLCHAIN")
-                .takes_value(true)
-                .long("toolchain")
-                .help("The rust toolchain to use for building the rust code"),
-        )
-        .arg(
-            Arg::with_name("CPPCOMPILER")
-                .takes_value(true)
-                .default_value("clang++")
-                .long("cppcompiler")
-                .help("The C++ compiler to use for compiling the c++ code"),
-        )
-        .arg(
-            Arg::with_name("RUNNER")
-                .takes_value(true)
-                .long("runner")
-                .help("Run the C programs under emulation with this command"),
-        )
-        .arg(
-            Arg::with_name("SKIP")
-                .takes_value(true)
-                .long("skip")
-                .help("Filename for a list of intrinsics to skip (one per line)"),
-        )
-        .arg(
-            Arg::with_name("A32")
-                .takes_value(false)
-                .long("a32")
-                .help("Run tests for A32 instrinsics instead of A64"),
-        )
-        .get_matches();
-
-    let filename = matches.value_of("INPUT").unwrap();
-    let toolchain = matches
-        .value_of("TOOLCHAIN")
-        .map_or("".into(), |t| format!("+{t}"));
+    let args: Cli = clap::Parser::parse();

-    let cpp_compiler = matches.value_of("CPPCOMPILER").unwrap();
-    let c_runner = matches.value_of("RUNNER").unwrap_or("");
-    let skip = if let Some(filename) = matches.value_of("SKIP") {
+    let filename = args.input;
+    let toolchain = args.toolchain.map_or_else(String::new, |t| format!("+{t}"));
+    let cpp_compiler = args.cppcompiler;
+    let c_runner = args.runner.unwrap_or_else(String::new);
+    let skip = if let Some(filename) = args.skip {
         let data = std::fs::read_to_string(&filename).expect("Failed to open file");
         data.lines()
             .map(str::trim)
@@ -381,8 +370,8 @@ fn main() {
     } else {
         Default::default()
     };
-    let a32 = matches.is_present("A32");
-    let mut intrinsics = get_neon_intrinsics(filename).expect("Error parsing input file");
+    let a32 = args.a32;
+    let mut intrinsics = get_neon_intrinsics(&filename).expect("Error parsing input file");

     intrinsics.sort_by(|a, b| a.name.cmp(&b.name));

@@ -409,7 +398,7 @@ fn main() {

     let notices = build_notices("// ");

-    if !build_c(&notices, &intrinsics, cpp_compiler, a32) {
+    if !build_c(&notices, &intrinsics, &cpp_compiler, a32) {
         std::process::exit(2);
     }

diff --git a/library/stdarch/crates/simd-test-macro/Cargo.toml b/library/stdarch/crates/simd-test-macro/Cargo.toml
index cd110c1d3..c9e692d8e 100644
--- a/library/stdarch/crates/simd-test-macro/Cargo.toml
+++ b/library/stdarch/crates/simd-test-macro/Cargo.toml
@@ -11,3 +11,4 @@ test = false
 [dependencies]
 proc-macro2 = "1.0"
 quote = "1.0"
+syn = { version = "2.0", features = ["full"] }
diff --git a/library/stdarch/crates/simd-test-macro/src/lib.rs b/library/stdarch/crates/simd-test-macro/src/lib.rs
index 2a31dd745..9e089f86b 100644
--- a/library/stdarch/crates/simd-test-macro/src/lib.rs
+++ b/library/stdarch/crates/simd-test-macro/src/lib.rs
@@ -7,7 +7,7 @@
 #[macro_use]
 extern crate quote;

-use proc_macro2::{Delimiter, Ident, Literal, Span, TokenStream, TokenTree};
+use proc_macro2::{Ident, Literal, Span, TokenStream, TokenTree};
 use quote::ToTokens;
 use std::env;

@@ -44,13 +44,9 @@ pub fn simd_test(
         .collect();
     let enable_feature = string(enable_feature);

-    let item = TokenStream::from(item);
-    let name = find_name(item.clone());
-
-    let name: TokenStream = name
-        .to_string()
-        .parse()
-        .unwrap_or_else(|_| panic!("failed to parse name: {}", name.to_string()));
+    let mut item = syn::parse_macro_input!(item as syn::ItemFn);
+    let item_attrs = std::mem::take(&mut item.attrs);
+    let name = &item.sig.ident;

     let target = env::var("TARGET").expect(
         "TARGET environment variable should be set for rustc (e.g. TARGET=x86_64-apple-darwin cargo test)"
@@ -109,6 +105,7 @@ pub fn simd_test(
         #[allow(non_snake_case)]
         #[test]
         #maybe_ignore
+        #(#item_attrs)*
         fn #name() {
             if #force_test | (#cfg_target_features) {
                 let v = unsafe { #name() };
@@ -123,29 +120,3 @@ pub fn simd_test(
     };
     ret.into()
 }
-
-fn find_name(item: TokenStream) -> Ident {
-    let mut tokens = item.into_iter();
-    while let Some(tok) = tokens.next() {
-        if let TokenTree::Ident(word) = tok {
-            if word == "fn" {
-                break;
-            }
-        }
-    }
-
-    fn get_ident(tt: TokenTree) -> Option {
-        match tt {
-            TokenTree::Ident(i) => Some(i),
-            TokenTree::Group(g) if g.delimiter() == Delimiter::None => {
-                get_ident(g.stream().into_iter().next()?)
-            }
-            _ => None,
-        }
-    }
-
-    tokens
-        .next()
-        .and_then(get_ident)
-        .expect("failed to find function name")
-}
diff --git a/library/stdarch/crates/std_detect/Cargo.toml b/library/stdarch/crates/std_detect/Cargo.toml
index 589a3900a..12d4a658c 100644
--- a/library/stdarch/crates/std_detect/Cargo.toml
+++ b/library/stdarch/crates/std_detect/Cargo.toml
@@ -30,7 +30,6 @@ compiler_builtins = { version = "0.1.2", optional = true }
 alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" }

 [dev-dependencies]
-auxv = "0.3.3"
 cupid = "0.6.0"

 [features]
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
index 8bc0b30c3..ee46aa1ac 100644
--- a/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
@@ -19,6 +19,7 @@ pub(crate) const AT_HWCAP2: usize = 26;
 /// If an entry cannot be read all the bits in the bitfield are set to zero.
 /// This should be interpreted as all the features being disabled.
 #[derive(Debug, Copy, Clone)]
+#[cfg_attr(test, derive(PartialEq))]
 pub(crate) struct AuxVec {
     pub hwcap: usize,
     #[cfg(any(
@@ -174,9 +175,12 @@ pub(crate) fn auxv() -> Result {
 /// Tries to read the `key` from the auxiliary vector by calling the
 /// dynamically-linked `getauxval` function. If the function is not linked,
 /// this function return `Err`.
-#[cfg(all(
-    feature = "std_detect_dlsym_getauxval",
-    not(all(target_os = "linux", target_env = "gnu"))
+#[cfg(any(
+    test,
+    all(
+        feature = "std_detect_dlsym_getauxval",
+        not(all(target_os = "linux", target_env = "gnu"))
+    )
 ))]
 fn getauxval(key: usize) -> Result {
     use libc;
@@ -262,35 +266,8 @@ fn auxv_from_buf(buf: &[usize]) -> Result {

 #[cfg(test)]
 mod tests {
-    extern crate auxv as auxv_crate;
     use super::*;

-    // Reads the Auxiliary Vector key from /proc/self/auxv
-    // using the auxv crate.
-    #[cfg(feature = "std_detect_file_io")]
-    fn auxv_crate_getprocfs(key: usize) -> Option {
-        use self::auxv_crate::procfs::search_procfs_auxv;
-        use self::auxv_crate::AuxvType;
-        let k = key as AuxvType;
-        match search_procfs_auxv(&[k]) {
-            Ok(v) => Some(v[&k] as usize),
-            Err(_) => None,
-        }
-    }
-
-    // Reads the Auxiliary Vector key from getauxval()
-    // using the auxv crate.
-    #[cfg(not(any(target_arch = "mips", target_arch = "mips64")))]
-    fn auxv_crate_getauxval(key: usize) -> Option {
-        use self::auxv_crate::getauxval::Getauxval;
-        use self::auxv_crate::AuxvType;
-        let q = auxv_crate::getauxval::NativeGetauxval {};
-        match q.getauxval(key as AuxvType) {
-            Ok(v) => Some(v as usize),
-            Err(_) => None,
-        }
-    }
-
     // FIXME: on mips/mips64 getauxval returns 0, and /proc/self/auxv
     // does not always contain the AT_HWCAP key under qemu.
     #[cfg(any(
@@ -301,7 +278,7 @@ mod tests {
     #[test]
     fn auxv_crate() {
         let v = auxv();
-        if let Some(hwcap) = auxv_crate_getauxval(AT_HWCAP) {
+        if let Ok(hwcap) = getauxval(AT_HWCAP) {
             let rt_hwcap = v.expect("failed to find hwcap key").hwcap;
             assert_eq!(rt_hwcap, hwcap);
         }
@@ -314,7 +291,7 @@ mod tests {
             target_arch = "powerpc64"
         ))]
         {
-            if let Some(hwcap2) = auxv_crate_getauxval(AT_HWCAP2) {
+            if let Ok(hwcap2) = getauxval(AT_HWCAP2) {
                 let rt_hwcap2 = v.expect("failed to find hwcap2 key").hwcap2;
                 assert_eq!(rt_hwcap2, hwcap2);
             }
@@ -391,22 +368,8 @@ mod tests {
     #[test]
     #[cfg(feature = "std_detect_file_io")]
     fn auxv_crate_procfs() {
-        let v = auxv();
-        if let Some(hwcap) = auxv_crate_getprocfs(AT_HWCAP) {
-            assert_eq!(v.unwrap().hwcap, hwcap);
-        }
-
-        // Targets with AT_HWCAP and AT_HWCAP2:
-        #[cfg(any(
-            target_arch = "aarch64",
-            target_arch = "arm",
-            target_arch = "powerpc",
-            target_arch = "powerpc64"
-        ))]
-        {
-            if let Some(hwcap2) = auxv_crate_getprocfs(AT_HWCAP2) {
-                assert_eq!(v.unwrap().hwcap2, hwcap2);
-            }
+        if let Ok(procfs_auxv) = auxv_from_file("/proc/self/auxv") {
+            assert_eq!(auxv().unwrap(), procfs_auxv);
         }
     }
 }
diff --git a/library/stdarch/crates/std_detect/src/detect/os/x86.rs b/library/stdarch/crates/std_detect/src/detect/os/x86.rs
index d8afc1aca..d8dd84db4 100644
--- a/library/stdarch/crates/std_detect/src/detect/os/x86.rs
+++ b/library/stdarch/crates/std_detect/src/detect/os/x86.rs
@@ -49,11 +49,7 @@ pub(crate) fn detect_features() -> cache::Initializer {
             ecx,
             edx,
         } = __cpuid(0);
-        let vendor_id: [[u8; 4]; 3] = [
-            mem::transmute(ebx),
-            mem::transmute(edx),
-            mem::transmute(ecx),
-        ];
+        let vendor_id: [[u8; 4]; 3] = [ebx.to_ne_bytes(), edx.to_ne_bytes(), ecx.to_ne_bytes()];
         let vendor_id: [u8; 12] = mem::transmute(vendor_id);
         (max_basic_leaf, vendor_id)
     };
diff --git a/library/stdarch/crates/stdarch-test/Cargo.toml b/library/stdarch/crates/stdarch-test/Cargo.toml
index 3a2130d4e..3682fcd7e 100644
--- a/library/stdarch/crates/stdarch-test/Cargo.toml
+++ b/library/stdarch/crates/stdarch-test/Cargo.toml
@@ -20,7 +20,7 @@ cc = "1.0"
 # time, and we want to make updates to this explicit rather than automatically
 # picking up updates which might break CI with new instruction names.
 [target.'cfg(target_arch = "wasm32")'.dependencies]
-wasmprinter = "=0.2.53"
+wasmprinter = "=0.2.67"

 [features]
 default = []
diff --git a/library/stdarch/crates/stdarch-test/src/disassembly.rs b/library/stdarch/crates/stdarch-test/src/disassembly.rs
index 54df7261e..087fc46d4 100644
--- a/library/stdarch/crates/stdarch-test/src/disassembly.rs
+++ b/library/stdarch/crates/stdarch-test/src/disassembly.rs
@@ -81,6 +81,8 @@ pub(crate) fn disassemble_myself() -> HashSet {
     let add_args = if cfg!(target_os = "macos") && cfg!(target_arch = "aarch64") {
         // Target features need to be enabled for LLVM objdump on Macos ARM64
         vec!["--mattr=+v8.6a,+crypto,+tme"]
+    } else if cfg!(target_arch = "riscv64") {
+        vec!["--mattr=+zk,+zks,+zbc,+zbb"]
     } else {
         vec![]
     };
diff --git a/library/stdarch/crates/stdarch-verify/Cargo.toml b/library/stdarch/crates/stdarch-verify/Cargo.toml
index 10ae90074..515f05138 100644
--- a/library/stdarch/crates/stdarch-verify/Cargo.toml
+++ b/library/stdarch/crates/stdarch-verify/Cargo.toml
@@ -7,7 +7,7 @@ edition = "2021"
 [dependencies]
 proc-macro2 = "1.0"
 quote = "1.0"
-syn = { version = "1.0", features = ["full"] }
+syn = { version = "2.0", features = ["full"] }

 [lib]
 proc-macro = true
@@ -15,5 +15,5 @@ test = false

 [dev-dependencies]
 serde = { version = "1.0", features = ['derive'] }
-serde-xml-rs = "0.3"
+serde-xml-rs = "0.6"
 serde_json = "1.0.96"
diff --git a/library/stdarch/crates/stdarch-verify/src/lib.rs b/library/stdarch/crates/stdarch-verify/src/lib.rs
index a9bf89f70..3f9eb3bf9 100644
--- a/library/stdarch/crates/stdarch-verify/src/lib.rs
+++ b/library/stdarch/crates/stdarch-verify/src/lib.rs
@@ -7,6 +7,7 @@ extern crate syn;
 use proc_macro::TokenStream;
 use std::{fs::File, io::Read, path::Path};
 use syn::ext::IdentExt;
+use syn::parse::Parser as _;

 #[proc_macro]
 pub fn x86_functions(input: TokenStream) -> TokenStream {
@@ -416,7 +417,7 @@ fn walk(root: &Path, files: &mut Vec<(syn::File, String)>) {

 fn find_instrs(attrs: &[syn::Attribute]) -> Vec {
     struct AssertInstr {
-        instr: String,
+        instr: Option,
     }

     // A small custom parser to parse out the instruction in `assert_instr`.
     // TODO: should probably just reuse `Invoc` from the `assert-instr-macro`
     // crate.
     impl syn::parse::Parse for AssertInstr {
-        fn parse(content: syn::parse::ParseStream<'_>) -> syn::Result {
-            let input;
-            parenthesized!(input in content);
-            let _ = input.parse::()?;
-            let _ = input.parse::()?;
-            let ident = input.parse::()?;
-            if ident != "assert_instr" {
-                return Err(input.error("expected `assert_instr`"));
+        fn parse(input: syn::parse::ParseStream<'_>) -> syn::Result {
+            let _ = input.parse::().unwrap();
+            let _ = input.parse::().unwrap();
+
+            match input.parse::() {
+                Ok(ident) if ident == "assert_instr" => {}
+                _ => {
+                    while !input.is_empty() {
+                        // consume everything
+                        drop(input.parse::());
+                    }
+                    return Ok(Self { instr: None });
+                }
             }
+
             let instrs;
             parenthesized!(instrs in input);
@@ -452,18 +459,24 @@ fn find_instrs(attrs: &[syn::Attribute]) -> Vec {
                     return Err(input.error("failed to parse instruction"));
                 }
             }
-            Ok(Self { instr })
+            Ok(Self { instr: Some(instr) })
         }
     }

     attrs
         .iter()
-        .filter(|a| a.path.is_ident("cfg_attr"))
         .filter_map(|a| {
-            syn::parse2::(a.tokens.clone())
-                .ok()
-                .map(|a| a.instr)
+            if let syn::Meta::List(ref l) = a.meta {
+                if l.path.is_ident("cfg_attr") {
+                    Some(l)
+                } else {
+                    None
+                }
+            } else {
+                None
+            }
         })
+        .filter_map(|l| syn::parse2::(l.tokens.clone()).unwrap().instr)
         .collect()
 }

@@ -471,19 +484,26 @@ fn find_target_feature(attrs: &[syn::Attribute]) -> Option {
     attrs
         .iter()
         .flat_map(|a| {
-            if let Ok(syn::Meta::List(i)) = a.parse_meta() {
-                if i.path.is_ident("target_feature") {
-                    return i.nested;
+            if let syn::Meta::List(ref l) = a.meta {
+                if l.path.is_ident("target_feature") {
+                    if let Ok(l) =
+                        syn::punctuated::Punctuated::::parse_terminated
+                            .parse2(l.tokens.clone())
+                    {
+                        return l;
+                    }
                 }
             }
             syn::punctuated::Punctuated::new()
         })
-        .filter_map(|nested| match nested {
-            syn::NestedMeta::Meta(m) => Some(m),
-            syn::NestedMeta::Lit(_) => None,
-        })
         .find_map(|m| match m {
-            syn::Meta::NameValue(ref i) if i.path.is_ident("enable") => Some(i.clone().lit),
+            syn::Meta::NameValue(i) if i.path.is_ident("enable") => {
+                if let syn::Expr::Lit(lit) = i.value {
+                    Some(lit.lit)
+                } else {
+                    None
+                }
+            }
             _ => None,
         })
 }
@@ -491,9 +511,16 @@ fn find_required_const(name: &str, attrs: &[syn::Attribute]) -> Vec {
     attrs
         .iter()
-        .flat_map(|a| {
-            if a.path.segments[0].ident == name {
-                syn::parse::(a.tokens.clone().into())
+        .filter_map(|a| {
+            if let syn::Meta::List(ref l) = a.meta {
+                Some(l)
+            } else {
+                None
+            }
+        })
+        .flat_map(|l| {
+            if l.path.segments[0].ident == name {
+                syn::parse2::(l.tokens.clone())
                     .unwrap()
                     .args
             } else {
@@ -509,10 +536,7 @@ struct RustcArgsRequiredConst {

 impl syn::parse::Parse for RustcArgsRequiredConst {
     fn parse(input: syn::parse::ParseStream<'_>) -> syn::Result {
-        let content;
-        parenthesized!(content in input);
-        let list =
-            syn::punctuated::Punctuated::::parse_terminated(&content)?;
+        let list = syn::punctuated::Punctuated::::parse_terminated(&input)?;
         Ok(Self {
             args: list
                 .into_iter()
--
cgit v1.2.3
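
The simd-test-macro hunks earlier in the patch drop the hand-rolled `find_name` token walk and instead parse the annotated function with syn, taking the test name from `item.sig.ident` and lifting the original attributes off `item.attrs` so they can be re-emitted on the generated wrapper. A minimal standalone sketch of that pattern, outside a proc-macro context; the source text and printed output here are invented for illustration, and it assumes `syn = { version = "2.0", features = ["full"] }`:

```rust
// Sketch only: parse a function item the way the updated macro does, then read
// its identifier and attributes. Inside the real macro this would be
// syn::parse_macro_input!(item as syn::ItemFn) rather than syn::parse_str.
fn main() -> Result<(), syn::Error> {
    let src = r#"
        #[ignore]
        fn test_mm_add_ps() {}
    "#;
    let mut item: syn::ItemFn = syn::parse_str(src)?;
    // Take the attributes off the original function so they can be re-emitted
    // on the generated #[test] wrapper (the #(#item_attrs)* interpolation above).
    let item_attrs = std::mem::take(&mut item.attrs);
    println!("function name: {}", item.sig.ident);
    println!("lifted {} attribute(s)", item_attrs.len());
    Ok(())
}
```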
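The auxvec hunks remove the dev-dependency on the external `auxv` crate and make the tests compare `auxv()` against the crate's own `getauxval`-based and procfs-based readers. For orientation, this is roughly what a direct `getauxval` query looks like through the libc wrapper; the real std_detect code resolves the symbol with `dlsym` so it degrades gracefully, so treat this as a simplified sketch that assumes a linux-gnu target:

```rust
// Sketch only, assuming a linux-gnu target where libc links getauxval directly.
// std_detect itself looks the symbol up at run time instead of linking it.
#[cfg(all(target_os = "linux", target_env = "gnu"))]
fn hwcap_from_getauxval() -> Result<usize, ()> {
    // getauxval returns 0 for keys that are missing from the auxiliary vector.
    let v = unsafe { libc::getauxval(libc::AT_HWCAP) } as usize;
    if v != 0 {
        Ok(v)
    } else {
        Err(())
    }
}

#[cfg(all(target_os = "linux", target_env = "gnu"))]
fn main() {
    match hwcap_from_getauxval() {
        Ok(hwcap) => println!("AT_HWCAP = {hwcap:#x}"),
        Err(()) => println!("AT_HWCAP unavailable"),
    }
}

#[cfg(not(all(target_os = "linux", target_env = "gnu")))]
fn main() {}
```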
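The `detect/os/x86.rs` hunk replaces three `mem::transmute` calls on the cpuid registers with `u32::to_ne_bytes`, which expresses the same byte reinterpretation without extra `unsafe`. A small self-contained illustration of the technique; the hard-coded register values stand in for the `ebx`/`edx`/`ecx` that `__cpuid(0)` would return on an Intel CPU:

```rust
// Sketch only: assemble the 12-byte cpuid vendor string from three 32-bit
// registers with to_ne_bytes instead of transmute.
fn vendor_id(ebx: u32, edx: u32, ecx: u32) -> [u8; 12] {
    let mut out = [0u8; 12];
    out[0..4].copy_from_slice(&ebx.to_ne_bytes());
    out[4..8].copy_from_slice(&edx.to_ne_bytes());
    out[8..12].copy_from_slice(&ecx.to_ne_bytes());
    out
}

fn main() {
    // "Genu", "ineI", "ntel" as little-endian u32s, the layout cpuid uses on x86.
    let id = vendor_id(0x756e_6547, 0x4965_6e69, 0x6c65_746e);
    // to_ne_bytes is native-endian; on x86 (little-endian) this yields the ASCII string.
    assert_eq!(&id, b"GenuineIntel");
    println!("{}", core::str::from_utf8(&id).unwrap());
}
```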
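Most of the stdarch-verify rewrite is mechanical fallout from the syn 1.0 to 2.0 bump: `Attribute::parse_meta` and the raw `tokens` field are gone, the parsed form now lives in `attr.meta`, and nested meta lists are recovered by parsing the list's token stream with `Punctuated::parse_terminated`. A standalone sketch of that access pattern; the function and attribute below are invented for illustration and assume `syn = { version = "2.0", features = ["full"] }`:

```rust
use syn::parse::Parser as _;

// Sketch only: find a #[cfg_attr(...)] attribute and walk its comma-separated
// arguments, mirroring the shape of the updated find_instrs/find_target_feature.
fn main() -> Result<(), syn::Error> {
    let item: syn::ItemFn = syn::parse_str(
        r#"
        #[cfg_attr(test, assert_instr(vaddps))]
        fn dummy() {}
        "#,
    )?;

    for attr in &item.attrs {
        // In syn 2.0 the parsed attribute lives in attr.meta; a parenthesised
        // attribute such as cfg_attr shows up as Meta::List with raw tokens.
        if let syn::Meta::List(list) = &attr.meta {
            if list.path.is_ident("cfg_attr") {
                let nested = syn::punctuated::Punctuated::<syn::Meta, syn::Token![,]>::parse_terminated
                    .parse2(list.tokens.clone())?;
                for meta in nested {
                    if let Some(ident) = meta.path().get_ident() {
                        println!("nested meta: {ident}");
                    }
                }
            }
        }
    }
    Ok(())
}
```

The same `Meta::List` plus `parse2` combination is what lets the patch drop the old `NestedMeta` handling entirely.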