author:    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-06-07 05:48:48 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org>  2024-06-07 05:48:48 +0000
commit:    ef24de24a82fe681581cc130f342363c47c0969a (patch)
tree:      0d494f7e1a38b95c92426f58fe6eaa877303a86c /library/stdarch
parent:    Releasing progress-linux version 1.74.1+dfsg1-1~progress7.99u1. (diff)
Merging upstream version 1.75.0+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'library/stdarch')
-rw-r--r--  library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile | 10
-rwxr-xr-x  library/stdarch/ci/run.sh | 7
-rw-r--r--  library/stdarch/crates/assert-instr-macro/Cargo.toml | 2
-rw-r--r--  library/stdarch/crates/assert-instr-macro/src/lib.rs | 13
-rw-r--r--  library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs | 9
-rw-r--r--  library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs | 4
-rw-r--r--  library/stdarch/crates/core_arch/src/riscv64/zk.rs | 57
-rw-r--r--  library/stdarch/crates/core_arch/src/riscv_shared/zb.rs | 12
-rw-r--r--  library/stdarch/crates/core_arch/src/riscv_shared/zk.rs | 30
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx.rs | 40
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx2.rs | 46
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs | 12
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx512bw.rs | 104
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx512f.rs | 333
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/sse.rs | 186
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/sse2.rs | 309
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/sse3.rs | 18
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/sse41.rs | 72
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/test.rs | 9
-rw-r--r--  library/stdarch/crates/core_arch/src/x86_64/avx512f.rs | 60
-rw-r--r--  library/stdarch/crates/core_arch/src/x86_64/sse2.rs | 3
-rw-r--r--  library/stdarch/crates/intrinsic-test/Cargo.toml | 6
-rw-r--r--  library/stdarch/crates/intrinsic-test/README.md | 4
-rw-r--r--  library/stdarch/crates/intrinsic-test/src/json_parser.rs | 3
-rw-r--r--  library/stdarch/crates/intrinsic-test/src/main.rs | 93
-rw-r--r--  library/stdarch/crates/simd-test-macro/Cargo.toml | 1
-rw-r--r--  library/stdarch/crates/simd-test-macro/src/lib.rs | 39
-rw-r--r--  library/stdarch/crates/std_detect/Cargo.toml | 1
-rw-r--r--  library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs | 59
-rw-r--r--  library/stdarch/crates/std_detect/src/detect/os/x86.rs | 6
-rw-r--r--  library/stdarch/crates/stdarch-test/Cargo.toml | 2
-rw-r--r--  library/stdarch/crates/stdarch-test/src/disassembly.rs | 2
-rw-r--r--  library/stdarch/crates/stdarch-verify/Cargo.toml | 4
-rw-r--r--  library/stdarch/crates/stdarch-verify/src/lib.rs | 82
-rw-r--r--  library/stdarch/examples/Cargo.toml | 4
35 files changed, 858 insertions, 784 deletions
diff --git a/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile b/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile
index 7ea795cac..af26b2682 100644
--- a/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile
+++ b/library/stdarch/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile
@@ -2,8 +2,12 @@ FROM ubuntu:23.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc libc6-dev qemu-user ca-certificates \
- gcc-riscv64-linux-gnu libc6-dev-riscv64-cross
+ gcc-riscv64-linux-gnu libc6-dev-riscv64-cross \
+ llvm
ENV CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_LINKER=riscv64-linux-gnu-gcc \
- CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUNNER="qemu-riscv64 -L /usr/riscv64-linux-gnu -cpu rv64,zk=true,zbb=true,zbc=true" \
- OBJDUMP=riscv64-linux-gnu-objdump
+ CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUNNER="qemu-riscv64 \
+ -L /usr/riscv64-linux-gnu \
+ -cpu rv64,zk=true,zks=true,zbb=true,zbc=true \
+ " \
+ OBJDUMP=llvm-objdump
diff --git a/library/stdarch/ci/run.sh b/library/stdarch/ci/run.sh
index 7b2416fda..20cd4c564 100755
--- a/library/stdarch/ci/run.sh
+++ b/library/stdarch/ci/run.sh
@@ -33,6 +33,11 @@ case ${TARGET} in
i686-* | i586-*)
export RUSTFLAGS="${RUSTFLAGS} -C relocation-model=static -Z plt=yes"
;;
+ # Some x86_64 targets enable by default more features beyond SSE2,
+ # which cause some instruction assertion checks to fail.
+ x86_64-*)
+ export RUSTFLAGS="${RUSTFLAGS} -C target-feature=-sse3"
+ ;;
#Unoptimized build uses fast-isel which breaks with msa
mips-* | mipsel-*)
export RUSTFLAGS="${RUSTFLAGS} -C llvm-args=-fast-isel=false"
@@ -47,7 +52,7 @@ case ${TARGET} in
# Some of our test dependencies use the deprecated `gcc` crates which
# doesn't detect RISC-V compilers automatically, so do it manually here.
riscv64*)
- export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+zk,+zbb,+zbc"
+ export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+zk,+zks,+zbb,+zbc"
export TARGET_CC="riscv64-linux-gnu-gcc"
;;
esac
diff --git a/library/stdarch/crates/assert-instr-macro/Cargo.toml b/library/stdarch/crates/assert-instr-macro/Cargo.toml
index 4ad654e69..881c8109c 100644
--- a/library/stdarch/crates/assert-instr-macro/Cargo.toml
+++ b/library/stdarch/crates/assert-instr-macro/Cargo.toml
@@ -11,4 +11,4 @@ test = false
[dependencies]
proc-macro2 = "1.0"
quote = "1.0"
-syn = { version = "1.0", features = ["full"] }
+syn = { version = "2.0", features = ["full"] }
diff --git a/library/stdarch/crates/assert-instr-macro/src/lib.rs b/library/stdarch/crates/assert-instr-macro/src/lib.rs
index 99e37c910..c9de43943 100644
--- a/library/stdarch/crates/assert-instr-macro/src/lib.rs
+++ b/library/stdarch/crates/assert-instr-macro/src/lib.rs
@@ -35,6 +35,15 @@ pub fn assert_instr(
let instr = &invoc.instr;
let name = &func.sig.ident;
+ let maybe_allow_deprecated = if func
+ .attrs
+ .iter()
+ .any(|attr| attr.path().is_ident("deprecated"))
+ {
+ quote! { #[allow(deprecated)] }
+ } else {
+ quote! {}
+ };
// Disable assert_instr for x86 targets compiled with avx enabled, which
// causes LLVM to generate different intrinsics that the ones we are
@@ -108,7 +117,7 @@ pub fn assert_instr(
.attrs
.iter()
.filter(|attr| {
- attr.path
+ attr.path()
.segments
.first()
.expect("attr.path.segments.first() failed")
@@ -135,6 +144,7 @@ pub fn assert_instr(
let to_test = if disable_dedup_guard {
quote! {
#attrs
+ #maybe_allow_deprecated
#[no_mangle]
#[inline(never)]
pub unsafe extern #abi fn #shim_name(#(#inputs),*) #ret {
@@ -147,6 +157,7 @@ pub fn assert_instr(
const #shim_name_ptr : *const u8 = #shim_name_str.as_ptr();
#attrs
+ #maybe_allow_deprecated
#[no_mangle]
#[inline(never)]
pub unsafe extern #abi fn #shim_name(#(#inputs),*) #ret {
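The syn upgrade above (1.0 to 2.0) changes attribute access from the `attr.path` field to the `attr.path()` method, and the new `maybe_allow_deprecated` token stream suppresses the deprecation lint on shims generated for deprecated intrinsics. A minimal standalone sketch of that check, assuming the crate's existing proc-macro2/quote/syn 2.0 dependencies (the free-function form is illustrative, not the macro's actual structure):

use proc_macro2::TokenStream;
use quote::quote;
use syn::ItemFn;

// Returns `#[allow(deprecated)]` tokens only when the wrapped intrinsic is itself
// marked #[deprecated], so the generated #[no_mangle] shim does not trip the lint.
fn maybe_allow_deprecated(func: &ItemFn) -> TokenStream {
    if func
        .attrs
        .iter()
        .any(|attr| attr.path().is_ident("deprecated"))
    {
        quote! { #[allow(deprecated)] }
    } else {
        quote! {}
    }
}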
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs b/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs
index 6faae0fee..fe540a7d8 100644
--- a/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs
+++ b/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs
@@ -11,7 +11,8 @@ impl super::super::sealed::Dmb for SY {
#[inline(always)]
unsafe fn __dmb(&self) {
asm!(
- "mcr p15, 0, r0, c7, c10, 5",
+ "mcr p15, 0, {}, c7, c10, 5",
+ in(reg) 0_u32,
options(preserves_flags, nostack)
)
}
@@ -21,7 +22,8 @@ impl super::super::sealed::Dsb for SY {
#[inline(always)]
unsafe fn __dsb(&self) {
asm!(
- "mcr p15, 0, r0, c7, c10, 4",
+ "mcr p15, 0, {}, c7, c10, 4",
+ in(reg) 0_u32,
options(preserves_flags, nostack)
)
}
@@ -31,7 +33,8 @@ impl super::super::sealed::Isb for SY {
#[inline(always)]
unsafe fn __isb(&self) {
asm!(
- "mcr p15, 0, r0, c7, c5, 4",
+ "mcr p15, 0, {}, c7, c5, 4",
+ in(reg) 0_u32,
options(preserves_flags, nostack)
)
}
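The barrier change above stops hard-coding `r0`, which the old asm! string read without declaring it as an operand, and instead routes a zero through a compiler-chosen register. A minimal standalone sketch of one rewritten barrier, assuming an Arm target where these CP15 barriers apply (the function name is illustrative):

#[cfg(target_arch = "arm")]
#[inline(always)]
unsafe fn dmb_sy_cp15() {
    // DMB SY via CP15: the source register must hold zero, and the compiler now knows
    // which register that is because it is an explicit `in(reg)` operand.
    core::arch::asm!(
        "mcr p15, 0, {}, c7, c10, 5",
        in(reg) 0_u32,
        options(preserves_flags, nostack),
    );
}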
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs
index ebb8b7b9e..54bffa450 100644
--- a/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs
@@ -22,7 +22,7 @@ macro_rules! test_vsli {
let a = [$($a as $t),*];
let b = [$($b as $t),*];
let n_bit_mask: $t = (1 << $n) - 1;
- let e = [$(($a as $t & n_bit_mask) | ($b as $t << $n)),*];
+ let e = [$(($a as $t & n_bit_mask) | (($b as $t) << $n)),*];
let r = $fn_id::<$n>(transmute(a), transmute(b));
let mut d = e;
d = transmute(r);
@@ -60,7 +60,7 @@ macro_rules! test_vsri {
unsafe fn $test_id() {
let a = [$($a as $t),*];
let b = [$($b as $t),*];
- let n_bit_mask = ((1 as $t << $n) - 1).rotate_right($n);
+ let n_bit_mask = (((1 as $t) << $n) - 1).rotate_right($n);
let e = [$(($a as $t & n_bit_mask) | (($b as $t >> $n) & !n_bit_mask)),*];
let r = $fn_id::<$n>(transmute(a), transmute(b));
let mut d = e;
diff --git a/library/stdarch/crates/core_arch/src/riscv64/zk.rs b/library/stdarch/crates/core_arch/src/riscv64/zk.rs
index 3dbe3705d..9b403fc95 100644
--- a/library/stdarch/crates/core_arch/src/riscv64/zk.rs
+++ b/library/stdarch/crates/core_arch/src/riscv64/zk.rs
@@ -20,6 +20,9 @@ extern "unadjusted" {
#[link_name = "llvm.riscv.aes64ks2"]
fn _aes64ks2(rs1: i64, rs2: i64) -> i64;
+ #[link_name = "llvm.riscv.aes64im"]
+ fn _aes64im(rs1: i64) -> i64;
+
#[link_name = "llvm.riscv.sha512sig0"]
fn _sha512sig0(rs1: i64) -> i64;
@@ -50,8 +53,7 @@ extern "unadjusted" {
///
/// This function is safe to use if the `zkne` target feature is present.
#[target_feature(enable = "zkne")]
-// See #1464
-// #[cfg_attr(test, assert_instr(aes64es))]
+#[cfg_attr(test, assert_instr(aes64es))]
#[inline]
pub unsafe fn aes64es(rs1: u64, rs2: u64) -> u64 {
_aes64es(rs1 as i64, rs2 as i64) as u64
@@ -74,8 +76,7 @@ pub unsafe fn aes64es(rs1: u64, rs2: u64) -> u64 {
///
/// This function is safe to use if the `zkne` target feature is present.
#[target_feature(enable = "zkne")]
-// See #1464
-// #[cfg_attr(test, assert_instr(aes64esm))]
+#[cfg_attr(test, assert_instr(aes64esm))]
#[inline]
pub unsafe fn aes64esm(rs1: u64, rs2: u64) -> u64 {
_aes64esm(rs1 as i64, rs2 as i64) as u64
@@ -98,8 +99,7 @@ pub unsafe fn aes64esm(rs1: u64, rs2: u64) -> u64 {
///
/// This function is safe to use if the `zknd` target feature is present.
#[target_feature(enable = "zknd")]
-// See #1464
-// #[cfg_attr(test, assert_instr(aes64ds))]
+#[cfg_attr(test, assert_instr(aes64ds))]
#[inline]
pub unsafe fn aes64ds(rs1: u64, rs2: u64) -> u64 {
_aes64ds(rs1 as i64, rs2 as i64) as u64
@@ -122,8 +122,7 @@ pub unsafe fn aes64ds(rs1: u64, rs2: u64) -> u64 {
///
/// This function is safe to use if the `zknd` target feature is present.
#[target_feature(enable = "zknd")]
-// See #1464
-// #[cfg_attr(test, assert_instr(aes64dsm))]
+#[cfg_attr(test, assert_instr(aes64dsm))]
#[inline]
pub unsafe fn aes64dsm(rs1: u64, rs2: u64) -> u64 {
_aes64dsm(rs1 as i64, rs2 as i64) as u64
@@ -152,8 +151,7 @@ pub unsafe fn aes64dsm(rs1: u64, rs2: u64) -> u64 {
/// This function is safe to use if the `zkne` or `zknd` target feature is present.
#[target_feature(enable = "zkne", enable = "zknd")]
#[rustc_legacy_const_generics(1)]
-// See #1464
-// #[cfg_attr(test, assert_instr(aes64ks1i, RNUM = 0))]
+#[cfg_attr(test, assert_instr(aes64ks1i, RNUM = 0))]
#[inline]
pub unsafe fn aes64ks1i<const RNUM: u8>(rs1: u64) -> u64 {
static_assert!(RNUM <= 10);
@@ -177,13 +175,36 @@ pub unsafe fn aes64ks1i<const RNUM: u8>(rs1: u64) -> u64 {
///
/// This function is safe to use if the `zkne` or `zknd` target feature is present.
#[target_feature(enable = "zkne", enable = "zknd")]
-// See #1464
-// #[cfg_attr(test, assert_instr(aes64ks2))]
+#[cfg_attr(test, assert_instr(aes64ks2))]
#[inline]
pub unsafe fn aes64ks2(rs1: u64, rs2: u64) -> u64 {
_aes64ks2(rs1 as i64, rs2 as i64) as u64
}
+/// This instruction accelerates the inverse MixColumns step of the AES Block Cipher, and is used to aid creation of
+/// the decryption KeySchedule.
+///
+/// The instruction applies the inverse MixColumns transformation to two columns of the state array, packed
+/// into a single 64-bit register. It is used to create the inverse cipher KeySchedule, according to the equivalent
+/// inverse cipher construction in (Page 23, Section 5.3.5). This instruction must always be implemented
+/// such that its execution latency does not depend on the data being operated on.
+///
+/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions
+///
+/// Version: v1.0.1
+///
+/// Section: 3.9
+///
+/// # Safety
+///
+/// This function is safe to use if the `zkne` or `zknd` target feature is present.
+#[target_feature(enable = "zkne", enable = "zknd")]
+#[cfg_attr(test, assert_instr(aes64im))]
+#[inline]
+pub unsafe fn aes64im(rs1: u64) -> u64 {
+ _aes64im(rs1 as i64) as u64
+}
+
/// Implements the Sigma0 transformation function as used in the SHA2-512 hash function \[49\]
/// (Section 4.1.3).
///
@@ -201,8 +222,7 @@ pub unsafe fn aes64ks2(rs1: u64, rs2: u64) -> u64 {
///
/// This function is safe to use if the `zknh` target feature is present.
#[target_feature(enable = "zknh")]
-// See #1464
-// #[cfg_attr(test, assert_instr(sha512sig0))]
+#[cfg_attr(test, assert_instr(sha512sig0))]
#[inline]
pub unsafe fn sha512sig0(rs1: u64) -> u64 {
_sha512sig0(rs1 as i64) as u64
@@ -225,8 +245,7 @@ pub unsafe fn sha512sig0(rs1: u64) -> u64 {
///
/// This function is safe to use if the `zknh` target feature is present.
#[target_feature(enable = "zknh")]
-// See #1464
-// #[cfg_attr(test, assert_instr(sha512sig1))]
+#[cfg_attr(test, assert_instr(sha512sig1))]
#[inline]
pub unsafe fn sha512sig1(rs1: u64) -> u64 {
_sha512sig1(rs1 as i64) as u64
@@ -249,8 +268,7 @@ pub unsafe fn sha512sig1(rs1: u64) -> u64 {
///
/// This function is safe to use if the `zknh` target feature is present.
#[target_feature(enable = "zknh")]
-// See #1464
-// #[cfg_attr(test, assert_instr(sha512sum0))]
+#[cfg_attr(test, assert_instr(sha512sum0))]
#[inline]
pub unsafe fn sha512sum0(rs1: u64) -> u64 {
_sha512sum0(rs1 as i64) as u64
@@ -273,8 +291,7 @@ pub unsafe fn sha512sum0(rs1: u64) -> u64 {
///
/// This function is safe to use if the `zknh` target feature is present.
#[target_feature(enable = "zknh")]
-// See #1464
-// #[cfg_attr(test, assert_instr(sha512sum1))]
+#[cfg_attr(test, assert_instr(sha512sum1))]
#[inline]
pub unsafe fn sha512sum1(rs1: u64) -> u64 {
_sha512sum1(rs1 as i64) as u64
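The only functional addition in this file is the new `aes64im` intrinsic (inverse MixColumns on two packed AES state columns); the remaining hunks re-enable the previously commented-out `assert_instr` checks. A hedged usage sketch, assuming a riscv64 nightly toolchain with the unstable RISC-V crypto intrinsics available (the wrapper name is illustrative):

#[cfg(target_arch = "riscv64")]
#[target_feature(enable = "zkne", enable = "zknd")]
unsafe fn inv_mix_columns_pair(cols: u64) -> u64 {
    // Applies the AES inverse MixColumns transform to the two state columns packed in
    // `cols`, as used when deriving the equivalent-inverse-cipher key schedule.
    core::arch::riscv64::aes64im(cols)
}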
diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs b/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs
index cfae6caa5..6785c04fd 100644
--- a/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs
+++ b/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs
@@ -47,8 +47,7 @@ extern "unadjusted" {
///
/// This function is safe to use if the `zbb` target feature is present.
#[target_feature(enable = "zbb")]
-// See #1464
-// #[cfg_attr(test, assert_instr(orc.b))]
+#[cfg_attr(test, assert_instr(orc.b))]
#[inline]
pub unsafe fn orc_b(rs: usize) -> usize {
#[cfg(target_arch = "riscv32")]
@@ -76,8 +75,7 @@ pub unsafe fn orc_b(rs: usize) -> usize {
///
/// This function is safe to use if the `zbc` target feature is present.
#[target_feature(enable = "zbc")]
-// See #1464
-// #[cfg_attr(test, assert_instr(clmul))]
+#[cfg_attr(test, assert_instr(clmul))]
#[inline]
pub unsafe fn clmul(rs1: usize, rs2: usize) -> usize {
#[cfg(target_arch = "riscv32")]
@@ -105,8 +103,7 @@ pub unsafe fn clmul(rs1: usize, rs2: usize) -> usize {
///
/// This function is safe to use if the `zbc` target feature is present.
#[target_feature(enable = "zbc")]
-// See #1464
-// #[cfg_attr(test, assert_instr(clmulh))]
+#[cfg_attr(test, assert_instr(clmulh))]
#[inline]
pub unsafe fn clmulh(rs1: usize, rs2: usize) -> usize {
#[cfg(target_arch = "riscv32")]
@@ -134,8 +131,7 @@ pub unsafe fn clmulh(rs1: usize, rs2: usize) -> usize {
///
/// This function is safe to use if the `zbc` target feature is present.
#[target_feature(enable = "zbc")]
-// See #1464
-// #[cfg_attr(test, assert_instr(clmulr))]
+#[cfg_attr(test, assert_instr(clmulr))]
#[inline]
pub unsafe fn clmulr(rs1: usize, rs2: usize) -> usize {
#[cfg(target_arch = "riscv32")]
diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs b/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs
index db97f72bc..5fc5b4cda 100644
--- a/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs
+++ b/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs
@@ -62,8 +62,7 @@ extern "unadjusted" {
///
/// This function is safe to use if the `zbkx` target feature is present.
#[target_feature(enable = "zbkx")]
-// See #1464
-// #[cfg_attr(test, assert_instr(xperm8))]
+#[cfg_attr(test, assert_instr(xperm8))]
#[inline]
pub unsafe fn xperm8(rs1: usize, rs2: usize) -> usize {
#[cfg(target_arch = "riscv32")]
@@ -94,8 +93,7 @@ pub unsafe fn xperm8(rs1: usize, rs2: usize) -> usize {
///
/// This function is safe to use if the `zbkx` target feature is present.
#[target_feature(enable = "zbkx")]
-// See #1464
-// #[cfg_attr(test, assert_instr(xperm4))]
+#[cfg_attr(test, assert_instr(xperm4))]
#[inline]
pub unsafe fn xperm4(rs1: usize, rs2: usize) -> usize {
#[cfg(target_arch = "riscv32")]
@@ -129,8 +127,7 @@ pub unsafe fn xperm4(rs1: usize, rs2: usize) -> usize {
///
/// This function is safe to use if the `zknh` target feature is present.
#[target_feature(enable = "zknh")]
-// See #1464
-// #[cfg_attr(test, assert_instr(sha256sig0))]
+#[cfg_attr(test, assert_instr(sha256sig0))]
#[inline]
pub unsafe fn sha256sig0(rs1: u32) -> u32 {
_sha256sig0(rs1 as i32) as u32
@@ -156,8 +153,7 @@ pub unsafe fn sha256sig0(rs1: u32) -> u32 {
///
/// This function is safe to use if the `zknh` target feature is present.
#[target_feature(enable = "zknh")]
-// See #1464
-// #[cfg_attr(test, assert_instr(sha256sig1))]
+#[cfg_attr(test, assert_instr(sha256sig1))]
#[inline]
pub unsafe fn sha256sig1(rs1: u32) -> u32 {
_sha256sig1(rs1 as i32) as u32
@@ -183,8 +179,7 @@ pub unsafe fn sha256sig1(rs1: u32) -> u32 {
///
/// This function is safe to use if the `zknh` target feature is present.
#[target_feature(enable = "zknh")]
-// See #1464
-// #[cfg_attr(test, assert_instr(sha256sum0))]
+#[cfg_attr(test, assert_instr(sha256sum0))]
#[inline]
pub unsafe fn sha256sum0(rs1: u32) -> u32 {
_sha256sum0(rs1 as i32) as u32
@@ -210,8 +205,7 @@ pub unsafe fn sha256sum0(rs1: u32) -> u32 {
///
/// This function is safe to use if the `zknh` target feature is present.
#[target_feature(enable = "zknh")]
-// See #1464
-// #[cfg_attr(test, assert_instr(sha256sum1))]
+#[cfg_attr(test, assert_instr(sha256sum1))]
#[inline]
pub unsafe fn sha256sum1(rs1: u32) -> u32 {
_sha256sum1(rs1 as i32) as u32
@@ -288,8 +282,7 @@ pub unsafe fn sha256sum1(rs1: u32) -> u32 {
/// ```
#[target_feature(enable = "zksed")]
#[rustc_legacy_const_generics(2)]
-// See #1464
-// #[cfg_attr(test, assert_instr(sm4ed, BS = 0))]
+#[cfg_attr(test, assert_instr(sm4ed, BS = 0))]
#[inline]
pub unsafe fn sm4ed<const BS: u8>(rs1: u32, rs2: u32) -> u32 {
static_assert!(BS < 4);
@@ -368,8 +361,7 @@ pub unsafe fn sm4ed<const BS: u8>(rs1: u32, rs2: u32) -> u32 {
/// ```
#[target_feature(enable = "zksed")]
#[rustc_legacy_const_generics(2)]
-// See #1464
-// #[cfg_attr(test, assert_instr(sm4ks, BS = 0))]
+#[cfg_attr(test, assert_instr(sm4ks, BS = 0))]
#[inline]
pub unsafe fn sm4ks<const BS: u8>(rs1: u32, rs2: u32) -> u32 {
static_assert!(BS < 4);
@@ -409,8 +401,7 @@ pub unsafe fn sm4ks<const BS: u8>(rs1: u32, rs2: u32) -> u32 {
/// compression function `CF` uses the intermediate value `TT2` to calculate
/// the variable `E` in one iteration for subsequent processes.
#[target_feature(enable = "zksh")]
-// See #1464
-// #[cfg_attr(test, assert_instr(sm3p0))]
+#[cfg_attr(test, assert_instr(sm3p0))]
#[inline]
pub unsafe fn sm3p0(rs1: u32) -> u32 {
_sm3p0(rs1 as i32) as u32
@@ -454,8 +445,7 @@ pub unsafe fn sm3p0(rs1: u32) -> u32 {
/// ENDFOR
/// ```
#[target_feature(enable = "zksh")]
-// See #1464
-// #[cfg_attr(test, assert_instr(sm3p1))]
+#[cfg_attr(test, assert_instr(sm3p1))]
#[inline]
pub unsafe fn sm3p1(rs1: u32) -> u32 {
_sm3p1(rs1 as i32) as u32
diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs
index 00bcc1fa1..de5dc05b8 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx.rs
@@ -268,7 +268,11 @@ pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
- addsubpd256(a, b)
+ let a = a.as_f64x4();
+ let b = b.as_f64x4();
+ let add = simd_add(a, b);
+ let sub = simd_sub(a, b);
+ simd_shuffle!(add, sub, [4, 1, 6, 3])
}
/// Alternatively adds and subtracts packed single-precision (32-bit)
@@ -280,7 +284,11 @@ pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
- addsubps256(a, b)
+ let a = a.as_f32x8();
+ let b = b.as_f32x8();
+ let add = simd_add(a, b);
+ let sub = simd_sub(a, b);
+ simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
}
/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
@@ -511,7 +519,8 @@ pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
- vblendvpd(a, b, c)
+ let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::splat(0));
+ transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
}
/// Blends packed single-precision (32-bit) floating-point elements from
@@ -523,7 +532,8 @@ pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
- vblendvps(a, b, c)
+ let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::splat(0));
+ transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
}
/// Conditionally multiplies the packed single-precision (32-bit) floating-point
@@ -2056,7 +2066,10 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
#[cfg_attr(test, assert_instr(vmovmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
- movmskpd256(a)
+ // Propagate the highest bit to the rest, because simd_bitmask
+ // requires all-1 or all-0.
+ let mask: i64x4 = simd_lt(transmute(a), i64x4::splat(0));
+ simd_bitmask::<i64x4, u8>(mask).into()
}
/// Sets each bit of the returned mask based on the most significant bit of the
@@ -2069,7 +2082,10 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
#[cfg_attr(test, assert_instr(vmovmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
- movmskps256(a)
+ // Propagate the highest bit to the rest, because simd_bitmask
+ // requires all-1 or all-0.
+ let mask: i32x8 = simd_lt(transmute(a), i32x8::splat(0));
+ simd_bitmask::<i32x8, u8>(mask).into()
}
/// Returns vector of type __m256d with all elements set to zero.
@@ -2904,20 +2920,12 @@ pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 {
// LLVM intrinsics used in the above functions
#[allow(improper_ctypes)]
extern "C" {
- #[link_name = "llvm.x86.avx.addsub.pd.256"]
- fn addsubpd256(a: __m256d, b: __m256d) -> __m256d;
- #[link_name = "llvm.x86.avx.addsub.ps.256"]
- fn addsubps256(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.avx.round.pd.256"]
fn roundpd256(a: __m256d, b: i32) -> __m256d;
#[link_name = "llvm.x86.avx.round.ps.256"]
fn roundps256(a: __m256, b: i32) -> __m256;
#[link_name = "llvm.x86.avx.sqrt.ps.256"]
fn sqrtps256(a: __m256) -> __m256;
- #[link_name = "llvm.x86.avx.blendv.pd.256"]
- fn vblendvpd(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
- #[link_name = "llvm.x86.avx.blendv.ps.256"]
- fn vblendvps(a: __m256, b: __m256, c: __m256) -> __m256;
#[link_name = "llvm.x86.avx.dp.ps.256"]
fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256;
#[link_name = "llvm.x86.avx.hadd.pd.256"]
@@ -3026,10 +3034,6 @@ extern "C" {
fn vtestcps(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.ps"]
fn vtestnzcps(a: __m128, b: __m128) -> i32;
- #[link_name = "llvm.x86.avx.movmsk.pd.256"]
- fn movmskpd256(a: __m256d) -> i32;
- #[link_name = "llvm.x86.avx.movmsk.ps.256"]
- fn movmskps256(a: __m256) -> i32;
#[link_name = "llvm.x86.avx.min.ps.256"]
fn vminps(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.avx.max.ps.256"]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs
index e23c795ee..243a4cdab 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@@ -344,7 +344,10 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
- transmute(pavgw(a.as_u16x16(), b.as_u16x16()))
+ let a = simd_cast::<_, u32x16>(a.as_u16x16());
+ let b = simd_cast::<_, u32x16>(b.as_u16x16());
+ let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
+ transmute(simd_cast::<_, u16x16>(r))
}
/// Averages packed unsigned 8-bit integers in `a` and `b`.
@@ -355,7 +358,10 @@ pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
- transmute(pavgb(a.as_u8x32(), b.as_u8x32()))
+ let a = simd_cast::<_, u16x32>(a.as_u8x32());
+ let b = simd_cast::<_, u16x32>(b.as_u8x32());
+ let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
+ transmute(simd_cast::<_, u8x32>(r))
}
/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
@@ -458,7 +464,8 @@ pub unsafe fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
#[cfg_attr(test, assert_instr(vpblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
- transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
+ let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::splat(0));
+ transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
}
/// Broadcasts the low packed 8-bit integer from `a` to all elements of
@@ -2060,7 +2067,9 @@ pub unsafe fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __
#[cfg_attr(test, assert_instr(vpmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
- transmute(pmuldq(a.as_i32x8(), b.as_i32x8()))
+ let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
+ let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
+ transmute(simd_mul(a, b))
}
/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
@@ -2074,7 +2083,10 @@ pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
- transmute(pmuludq(a.as_u32x8(), b.as_u32x8()))
+ let a = a.as_u64x4();
+ let b = b.as_u64x4();
+ let mask = u64x4::splat(u32::MAX.into());
+ transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
}
/// Multiplies the packed 16-bit integers in `a` and `b`, producing
@@ -2087,7 +2099,10 @@ pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
- transmute(pmulhw(a.as_i16x16(), b.as_i16x16()))
+ let a = simd_cast::<_, i32x16>(a.as_i16x16());
+ let b = simd_cast::<_, i32x16>(b.as_i16x16());
+ let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
+ transmute(simd_cast::<i32x16, i16x16>(r))
}
/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
@@ -2100,7 +2115,10 @@ pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
- transmute(pmulhuw(a.as_u16x16(), b.as_u16x16()))
+ let a = simd_cast::<_, u32x16>(a.as_u16x16());
+ let b = simd_cast::<_, u32x16>(b.as_u16x16());
+ let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
+ transmute(simd_cast::<u32x16, u16x16>(r))
}
/// Multiplies the packed 16-bit integers in `a` and `b`, producing
@@ -3629,12 +3647,6 @@ extern "C" {
fn pabsw(a: i16x16) -> u16x16;
#[link_name = "llvm.x86.avx2.pabs.d"]
fn pabsd(a: i32x8) -> u32x8;
- #[link_name = "llvm.x86.avx2.pavg.b"]
- fn pavgb(a: u8x32, b: u8x32) -> u8x32;
- #[link_name = "llvm.x86.avx2.pavg.w"]
- fn pavgw(a: u16x16, b: u16x16) -> u16x16;
- #[link_name = "llvm.x86.avx2.pblendvb"]
- fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32;
#[link_name = "llvm.x86.avx2.phadd.w"]
fn phaddw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.phadd.d"]
@@ -3669,14 +3681,6 @@ extern "C" {
fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
#[link_name = "llvm.x86.avx2.mpsadbw"]
fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
- #[link_name = "llvm.x86.avx2.pmulhu.w"]
- fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
- #[link_name = "llvm.x86.avx2.pmulh.w"]
- fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
- #[link_name = "llvm.x86.avx2.pmul.dq"]
- fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
- #[link_name = "llvm.x86.avx2.pmulu.dq"]
- fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
#[link_name = "llvm.x86.avx2.pmul.hr.sw"]
fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.packsswb"]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
index 92e572eb1..ce4e402a8 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
@@ -311,7 +311,7 @@ pub unsafe fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __
#[target_feature(enable = "avx512bitalg")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 {
- transmute(bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0))
+ bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -326,7 +326,7 @@ pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64
#[target_feature(enable = "avx512bitalg")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m512i) -> __mmask64 {
- transmute(bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k))
+ bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -338,7 +338,7 @@ pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m
#[target_feature(enable = "avx512bitalg,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 {
- transmute(bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0))
+ bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -353,7 +353,7 @@ pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32
#[target_feature(enable = "avx512bitalg,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m256i) -> __mmask32 {
- transmute(bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k))
+ bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -365,7 +365,7 @@ pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m
#[target_feature(enable = "avx512bitalg,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 {
- transmute(bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0))
+ bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -380,7 +380,7 @@ pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 {
#[target_feature(enable = "avx512bitalg,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm_mask_bitshuffle_epi64_mask(k: __mmask16, b: __m128i, c: __m128i) -> __mmask16 {
- transmute(bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k))
+ bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k)
}
#[cfg(test)]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
index 364023539..0b4a56d36 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
@@ -3703,8 +3703,7 @@ pub unsafe fn _mm512_cmp_epu16_mask<const IMM8: i32>(a: __m512i, b: __m512i) ->
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x32();
let b = b.as_u16x32();
- let r = vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
- transmute(r)
+ vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3722,8 +3721,7 @@ pub unsafe fn _mm512_mask_cmp_epu16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x32();
let b = b.as_u16x32();
- let r = vpcmpuw(a, b, IMM8, k1);
- transmute(r)
+ vpcmpuw(a, b, IMM8, k1)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3737,8 +3735,7 @@ pub unsafe fn _mm256_cmp_epu16_mask<const IMM8: i32>(a: __m256i, b: __m256i) ->
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x16();
let b = b.as_u16x16();
- let r = vpcmpuw256(a, b, IMM8, 0b11111111_11111111);
- transmute(r)
+ vpcmpuw256(a, b, IMM8, 0b11111111_11111111)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3756,8 +3753,7 @@ pub unsafe fn _mm256_mask_cmp_epu16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x16();
let b = b.as_u16x16();
- let r = vpcmpuw256(a, b, IMM8, k1);
- transmute(r)
+ vpcmpuw256(a, b, IMM8, k1)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3771,8 +3767,7 @@ pub unsafe fn _mm_cmp_epu16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __m
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x8();
let b = b.as_u16x8();
- let r = vpcmpuw128(a, b, IMM8, 0b11111111);
- transmute(r)
+ vpcmpuw128(a, b, IMM8, 0b11111111)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3790,8 +3785,7 @@ pub unsafe fn _mm_mask_cmp_epu16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x8();
let b = b.as_u16x8();
- let r = vpcmpuw128(a, b, IMM8, k1);
- transmute(r)
+ vpcmpuw128(a, b, IMM8, k1)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3805,13 +3799,12 @@ pub unsafe fn _mm512_cmp_epu8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> _
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x64();
let b = b.as_u8x64();
- let r = vpcmpub(
+ vpcmpub(
a,
b,
IMM8,
0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
- );
- transmute(r)
+ )
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3829,8 +3822,7 @@ pub unsafe fn _mm512_mask_cmp_epu8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x64();
let b = b.as_u8x64();
- let r = vpcmpub(a, b, IMM8, k1);
- transmute(r)
+ vpcmpub(a, b, IMM8, k1)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3844,8 +3836,7 @@ pub unsafe fn _mm256_cmp_epu8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> _
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x32();
let b = b.as_u8x32();
- let r = vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
- transmute(r)
+ vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3863,8 +3854,7 @@ pub unsafe fn _mm256_mask_cmp_epu8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x32();
let b = b.as_u8x32();
- let r = vpcmpub256(a, b, IMM8, k1);
- transmute(r)
+ vpcmpub256(a, b, IMM8, k1)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3878,8 +3868,7 @@ pub unsafe fn _mm_cmp_epu8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mm
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x16();
let b = b.as_u8x16();
- let r = vpcmpub128(a, b, IMM8, 0b11111111_11111111);
- transmute(r)
+ vpcmpub128(a, b, IMM8, 0b11111111_11111111)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3897,8 +3886,7 @@ pub unsafe fn _mm_mask_cmp_epu8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x16();
let b = b.as_u8x16();
- let r = vpcmpub128(a, b, IMM8, k1);
- transmute(r)
+ vpcmpub128(a, b, IMM8, k1)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3912,8 +3900,7 @@ pub unsafe fn _mm512_cmp_epi16_mask<const IMM8: i32>(a: __m512i, b: __m512i) ->
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x32();
let b = b.as_i16x32();
- let r = vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
- transmute(r)
+ vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3931,8 +3918,7 @@ pub unsafe fn _mm512_mask_cmp_epi16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x32();
let b = b.as_i16x32();
- let r = vpcmpw(a, b, IMM8, k1);
- transmute(r)
+ vpcmpw(a, b, IMM8, k1)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3946,8 +3932,7 @@ pub unsafe fn _mm256_cmp_epi16_mask<const IMM8: i32>(a: __m256i, b: __m256i) ->
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x16();
let b = b.as_i16x16();
- let r = vpcmpw256(a, b, IMM8, 0b11111111_11111111);
- transmute(r)
+ vpcmpw256(a, b, IMM8, 0b11111111_11111111)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3965,8 +3950,7 @@ pub unsafe fn _mm256_mask_cmp_epi16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x16();
let b = b.as_i16x16();
- let r = vpcmpw256(a, b, IMM8, k1);
- transmute(r)
+ vpcmpw256(a, b, IMM8, k1)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3980,8 +3964,7 @@ pub unsafe fn _mm_cmp_epi16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __m
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x8();
let b = b.as_i16x8();
- let r = vpcmpw128(a, b, IMM8, 0b11111111);
- transmute(r)
+ vpcmpw128(a, b, IMM8, 0b11111111)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3999,8 +3982,7 @@ pub unsafe fn _mm_mask_cmp_epi16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x8();
let b = b.as_i16x8();
- let r = vpcmpw128(a, b, IMM8, k1);
- transmute(r)
+ vpcmpw128(a, b, IMM8, k1)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4014,13 +3996,12 @@ pub unsafe fn _mm512_cmp_epi8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> _
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x64();
let b = b.as_i8x64();
- let r = vpcmpb(
+ vpcmpb(
a,
b,
IMM8,
0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
- );
- transmute(r)
+ )
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4038,8 +4019,7 @@ pub unsafe fn _mm512_mask_cmp_epi8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x64();
let b = b.as_i8x64();
- let r = vpcmpb(a, b, IMM8, k1);
- transmute(r)
+ vpcmpb(a, b, IMM8, k1)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4053,8 +4033,7 @@ pub unsafe fn _mm256_cmp_epi8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> _
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x32();
let b = b.as_i8x32();
- let r = vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
- transmute(r)
+ vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4072,8 +4051,7 @@ pub unsafe fn _mm256_mask_cmp_epi8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x32();
let b = b.as_i8x32();
- let r = vpcmpb256(a, b, IMM8, k1);
- transmute(r)
+ vpcmpb256(a, b, IMM8, k1)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4087,8 +4065,7 @@ pub unsafe fn _mm_cmp_epi8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mm
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x16();
let b = b.as_i8x16();
- let r = vpcmpb128(a, b, IMM8, 0b11111111_11111111);
- transmute(r)
+ vpcmpb128(a, b, IMM8, 0b11111111_11111111)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4106,8 +4083,7 @@ pub unsafe fn _mm_mask_cmp_epi8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x16();
let b = b.as_i8x16();
- let r = vpcmpb128(a, b, IMM8, k1);
- transmute(r)
+ vpcmpb128(a, b, IMM8, k1)
}
/// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
@@ -8566,7 +8542,7 @@ pub unsafe fn _mm_movm_epi8(k: __mmask16) -> __m128i {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(a + b)
+ a + b
}
/// Add 64-bit masks in a and b, and store the result in k.
@@ -8575,7 +8551,7 @@ pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(a + b)
+ a + b
}
/// Compute the bitwise AND of 32-bit masks a and b, and store the result in k.
@@ -8584,7 +8560,7 @@ pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(a & b)
+ a & b
}
/// Compute the bitwise AND of 64-bit masks a and b, and store the result in k.
@@ -8593,7 +8569,7 @@ pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(a & b)
+ a & b
}
/// Compute the bitwise NOT of 32-bit mask a, and store the result in k.
@@ -8602,7 +8578,7 @@ pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 {
- transmute(a ^ 0b11111111_11111111_11111111_11111111)
+ a ^ 0b11111111_11111111_11111111_11111111
}
/// Compute the bitwise NOT of 64-bit mask a, and store the result in k.
@@ -8611,7 +8587,7 @@ pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 {
- transmute(a ^ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111)
+ a ^ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
}
/// Compute the bitwise NOT of 32-bit masks a and then AND with b, and store the result in k.
@@ -8620,7 +8596,7 @@ pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(_knot_mask32(a) & b)
+ _knot_mask32(a) & b
}
/// Compute the bitwise NOT of 64-bit masks a and then AND with b, and store the result in k.
@@ -8629,7 +8605,7 @@ pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(_knot_mask64(a) & b)
+ _knot_mask64(a) & b
}
/// Compute the bitwise OR of 32-bit masks a and b, and store the result in k.
@@ -8638,7 +8614,7 @@ pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(a | b)
+ a | b
}
/// Compute the bitwise OR of 64-bit masks a and b, and store the result in k.
@@ -8647,7 +8623,7 @@ pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(a | b)
+ a | b
}
/// Compute the bitwise XOR of 32-bit masks a and b, and store the result in k.
@@ -8656,7 +8632,7 @@ pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(a ^ b)
+ a ^ b
}
/// Compute the bitwise XOR of 64-bit masks a and b, and store the result in k.
@@ -8665,7 +8641,7 @@ pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(a ^ b)
+ a ^ b
}
/// Compute the bitwise XNOR of 32-bit masks a and b, and store the result in k.
@@ -8674,7 +8650,7 @@ pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(_knot_mask32(a ^ b))
+ _knot_mask32(a ^ b)
}
/// Compute the bitwise XNOR of 64-bit masks a and b, and store the result in k.
@@ -8683,7 +8659,7 @@ pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(_knot_mask64(a ^ b))
+ _knot_mask64(a ^ b)
}
/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
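Most of the avx512bw.rs changes drop `transmute` calls around values that already have the right type: the vpcmp* helpers return the mask type directly, and the AVX-512 mask types are plain integer aliases (for example `__mmask32` is `u32`), so the k-register helpers reduce to ordinary integer bit operations. A tiny illustration of the latter point (the alias and function names are stand-ins, not the core::arch definitions):

type Mask32 = u32; // stands in for __mmask32

fn kandn(a: Mask32, b: Mask32) -> Mask32 {
    // _kandn_mask32: NOT(a) AND b; the XOR-with-all-ones in _knot_mask32 is just `!a` for u32.
    !a & b
}

fn kxnor(a: Mask32, b: Mask32) -> Mask32 {
    // _kxnor_mask32: NOT(a XOR b)
    !(a ^ b)
}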
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
index 5412237ca..280135292 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
@@ -17144,7 +17144,7 @@ pub unsafe fn _mm512_slli_epi32<const IMM8: u32>(a: __m512i) -> __m512i {
if IMM8 >= 32 {
_mm512_setzero_si512()
} else {
- transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8 as u32)))
+ transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8)))
}
}
@@ -20132,7 +20132,7 @@ pub unsafe fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m5
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] //should be vpermd
pub unsafe fn _mm256_permutexvar_epi32(idx: __m256i, a: __m256i) -> __m256i {
- transmute(_mm256_permutevar8x32_epi32(a, idx)) // llvm use llvm.x86.avx2.permd
+ _mm256_permutevar8x32_epi32(a, idx) // llvm use llvm.x86.avx2.permd
}
/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -20284,7 +20284,7 @@ pub unsafe fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512)
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermps))]
pub unsafe fn _mm256_permutexvar_ps(idx: __m256i, a: __m256) -> __m256 {
- transmute(_mm256_permutevar8x32_ps(a, idx)) //llvm.x86.avx2.permps
+ _mm256_permutevar8x32_ps(a, idx) //llvm.x86.avx2.permps
}
/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -23943,7 +23943,7 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d {
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(vmovd))]
pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 {
let extract: i32 = simd_extract(a.as_i32x16(), 0);
- transmute(extract)
+ extract
}
/// Broadcast the low packed 32-bit integer from a to all elements of dst.
@@ -25744,7 +25744,7 @@ pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a & b)
+ a & b
}
/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k.
@@ -25754,7 +25754,7 @@ pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a & b)
+ a & b
}
/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
@@ -25764,7 +25764,7 @@ pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a | b)
+ a | b
}
/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
@@ -25774,7 +25774,7 @@ pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a | b)
+ a | b
}
/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
@@ -25784,7 +25784,7 @@ pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a ^ b)
+ a ^ b
}
/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
@@ -25794,7 +25794,7 @@ pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a ^ b)
+ a ^ b
}
/// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
@@ -25803,7 +25803,7 @@ pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 {
- transmute(a ^ 0b11111111_11111111)
+ a ^ 0b11111111_11111111
}
/// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
@@ -25812,7 +25812,7 @@ pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_knot(a: __mmask16) -> __mmask16 {
- transmute(a ^ 0b11111111_11111111)
+ a ^ 0b11111111_11111111
}
/// Compute the bitwise NOT of 16-bit masks a and then AND with b, and store the result in k.
@@ -25862,8 +25862,7 @@ pub unsafe fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw
pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 {
- let r: u16 = a;
- transmute(r)
+ a
}
/// Converts integer mask into bitmask, storing the result in dst.
@@ -25872,8 +25871,7 @@ pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 {
#[inline]
#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw
pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 {
- let r: u16 = mask as u16;
- transmute(r)
+ mask as u16
}
/// Converts bit mask k1 into an integer value, storing the results in dst.
@@ -25883,8 +25881,7 @@ pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw
pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 {
- let r: i32 = k1 as i32;
- transmute(r)
+ k1 as i32
}
/// Unpack and interleave 8 bits from masks a and b, and store the 16-bit result in k.
@@ -25896,7 +25893,7 @@ pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 {
pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 {
let a = a & 0b00000000_11111111;
let b = b & 0b11111111_00000000;
- transmute(a | b)
+ a | b
}
/// Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's.
@@ -32352,8 +32349,7 @@ pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -
if (k & 0b00000001) != 0 {
mov = simd_extract(b, 0);
}
- let r = simd_insert(a, 0, mov);
- transmute(r)
+ simd_insert(a, 0, mov)
}
/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32367,8 +32363,7 @@ pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
if (k & 0b00000001) != 0 {
mov = simd_extract(b, 0);
}
- let r = simd_insert(a, 0, mov);
- transmute(r)
+ simd_insert(a, 0, mov)
}
/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32383,8 +32378,7 @@ pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d
if (k & 0b00000001) != 0 {
mov = simd_extract(b, 0);
}
- let r = simd_insert(a, 0, mov);
- transmute(r)
+ simd_insert(a, 0, mov)
}
/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32398,8 +32392,7 @@ pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d
if (k & 0b00000001) != 0 {
mov = simd_extract(b, 0);
}
- let r = simd_insert(a, 0, mov);
- transmute(r)
+ simd_insert(a, 0, mov)
}
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32416,8 +32409,7 @@ pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
let extractb: f32 = simd_extract(b, 0);
add = extracta + extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32433,8 +32425,7 @@ pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
let extractb: f32 = simd_extract(b, 0);
add = extracta + extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32451,8 +32442,7 @@ pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
let extractb: f64 = simd_extract(b, 0);
add = extracta + extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32468,8 +32458,7 @@ pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
let extractb: f64 = simd_extract(b, 0);
add = extracta + extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
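The writemask/zeromask distinction in these scalar helpers is easiest to see with a tiny sketch (assumes AVX-512F at runtime; `_mm_set_ss`/`_mm_cvtss_f32` are the plain SSE helpers):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn masked_add_demo() {
    let src = _mm_set_ss(9.0);
    let a = _mm_set_ss(1.0);
    let b = _mm_set_ss(2.0);
    // Mask bit 0 clear: the writemask form copies the lower element from src,
    // the zeromask form zeroes it; the upper elements always come from a.
    assert_eq!(_mm_cvtss_f32(_mm_mask_add_ss(src, 0, a, b)), 9.0);
    assert_eq!(_mm_cvtss_f32(_mm_maskz_add_ss(0, a, b)), 0.0);
    // Mask bit 0 set: both produce 1.0 + 2.0 in the lower element.
    assert_eq!(_mm_cvtss_f32(_mm_mask_add_ss(src, 1, a, b)), 3.0);
}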
/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32486,8 +32475,7 @@ pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
let extractb: f32 = simd_extract(b, 0);
add = extracta - extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32503,8 +32491,7 @@ pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
let extractb: f32 = simd_extract(b, 0);
add = extracta - extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32521,8 +32508,7 @@ pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
let extractb: f64 = simd_extract(b, 0);
add = extracta - extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32538,8 +32524,7 @@ pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
let extractb: f64 = simd_extract(b, 0);
add = extracta - extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32556,8 +32541,7 @@ pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
let extractb: f32 = simd_extract(b, 0);
add = extracta * extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32573,8 +32557,7 @@ pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
let extractb: f32 = simd_extract(b, 0);
add = extracta * extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32591,8 +32574,7 @@ pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
let extractb: f64 = simd_extract(b, 0);
add = extracta * extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32608,8 +32590,7 @@ pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
let extractb: f64 = simd_extract(b, 0);
add = extracta * extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32626,8 +32607,7 @@ pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
let extractb: f32 = simd_extract(b, 0);
add = extracta / extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32643,8 +32623,7 @@ pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
let extractb: f32 = simd_extract(b, 0);
add = extracta / extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32661,8 +32640,7 @@ pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
let extractb: f64 = simd_extract(b, 0);
add = extracta / extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32678,8 +32656,7 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
let extractb: f64 = simd_extract(b, 0);
add = extracta / extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33587,8 +33564,7 @@ pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) ->
let extractc: f32 = simd_extract(c, 0);
fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33605,8 +33581,7 @@ pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -
let extractc: f32 = simd_extract(c, 0);
fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
@@ -33622,8 +33597,7 @@ pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -
let extractb: f32 = simd_extract(b, 0);
fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fmadd);
- transmute(r)
+ simd_insert(c, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33639,8 +33613,7 @@ pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d)
let extractc: f64 = simd_extract(c, 0);
fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33657,8 +33630,7 @@ pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d
let extractc: f64 = simd_extract(c, 0);
fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
@@ -33674,8 +33646,7 @@ pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8
let extractb: f64 = simd_extract(b, 0);
fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fmadd);
- transmute(r)
+ simd_insert(c, 0, fmadd)
}
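A short sketch of the masked scalar FMA pattern used above (assumes AVX-512F at runtime):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn fmadd_demo() -> (f32, f32) {
    let a = _mm_set_ss(2.0);
    let b = _mm_set_ss(3.0);
    let c = _mm_set_ss(4.0);
    // Bit 0 set: the lower element becomes a*b + c = 10.0.
    let hit = _mm_mask_fmadd_ss(a, 1, b, c);
    // Bit 0 clear: the lower element is copied from a (2.0).
    let miss = _mm_mask_fmadd_ss(a, 0, b, c);
    (_mm_cvtss_f32(hit), _mm_cvtss_f32(miss))
}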
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33692,8 +33663,7 @@ pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) ->
let extractc = -extractc;
fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33711,8 +33681,7 @@ pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -
let extractc = -extractc;
fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
@@ -33729,8 +33698,7 @@ pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -
let extractc = -fmsub;
fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fmsub);
- transmute(r)
+ simd_insert(c, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33747,8 +33715,7 @@ pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d)
let extractc = -extractc;
fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33766,8 +33733,7 @@ pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d
let extractc = -extractc;
fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
@@ -33784,8 +33750,7 @@ pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8
let extractc = -fmsub;
fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fmsub);
- transmute(r)
+ simd_insert(c, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33802,8 +33767,7 @@ pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -
let extractc: f32 = simd_extract(c, 0);
fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33821,8 +33785,7 @@ pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128)
let extractc: f32 = simd_extract(c, 0);
fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
@@ -33839,8 +33802,7 @@ pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8)
let extractb: f32 = simd_extract(b, 0);
fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fnmadd);
- transmute(r)
+ simd_insert(c, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33857,8 +33819,7 @@ pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d
let extractc: f64 = simd_extract(c, 0);
fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33876,8 +33837,7 @@ pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128
let extractc: f64 = simd_extract(c, 0);
fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
@@ -33894,8 +33854,7 @@ pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask
let extractb: f64 = simd_extract(b, 0);
fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fnmadd);
- transmute(r)
+ simd_insert(c, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33913,8 +33872,7 @@ pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -
let extractc = -extractc;
fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33933,8 +33891,7 @@ pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128)
let extractc = -extractc;
fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
@@ -33952,8 +33909,7 @@ pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8)
let extractc = -fnmsub;
fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fnmsub);
- transmute(r)
+ simd_insert(c, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33971,8 +33927,7 @@ pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d
let extractc = -extractc;
fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33991,8 +33946,7 @@ pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128
let extractc = -extractc;
fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
@@ -34010,8 +33964,7 @@ pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask
let extractc = -fnmsub;
fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fnmsub);
- transmute(r)
+ simd_insert(c, 0, fnmsub)
}
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35705,8 +35658,7 @@ pub unsafe fn _mm_fmadd_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: _
let extractb: f32 = simd_extract(b, 0);
let extractc: f32 = simd_extract(c, 0);
let r = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, r);
- transmute(r)
+ simd_insert(a, 0, r)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35736,8 +35688,7 @@ pub unsafe fn _mm_mask_fmadd_round_ss<const ROUNDING: i32>(
let extractc: f32 = simd_extract(c, 0);
fmadd = vfmadd132ss(fmadd, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35768,8 +35719,7 @@ pub unsafe fn _mm_maskz_fmadd_round_ss<const ROUNDING: i32>(
let extractc: f32 = simd_extract(c, 0);
fmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
@@ -35799,8 +35749,7 @@ pub unsafe fn _mm_mask3_fmadd_round_ss<const ROUNDING: i32>(
let extractb: f32 = simd_extract(b, 0);
fmadd = vfmadd132ss(extracta, extractb, fmadd, ROUNDING);
}
- let r = simd_insert(c, 0, fmadd);
- transmute(r)
+ simd_insert(c, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
@@ -35827,8 +35776,7 @@ pub unsafe fn _mm_fmadd_round_sd<const ROUNDING: i32>(
let extractb: f64 = simd_extract(b, 0);
let extractc: f64 = simd_extract(c, 0);
let fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -35858,8 +35806,7 @@ pub unsafe fn _mm_mask_fmadd_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
fmadd = vfmadd132sd(fmadd, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -35890,8 +35837,7 @@ pub unsafe fn _mm_maskz_fmadd_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
@@ -35921,8 +35867,7 @@ pub unsafe fn _mm_mask3_fmadd_round_sd<const ROUNDING: i32>(
let extractb: f64 = simd_extract(b, 0);
fmadd = vfmadd132sd(extracta, extractb, fmadd, ROUNDING);
}
- let r = simd_insert(c, 0, fmadd);
- transmute(r)
+ simd_insert(c, 0, fmadd)
}
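The `ROUNDING` const generic in these `_round` variants must be one of the `_MM_FROUND_*` combinations accepted by `static_assert_rounding!`; a hedged sketch (assumes AVX-512F at runtime):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn fmadd_round_demo() -> f64 {
    let a = _mm_set_sd(2.0);
    let b = _mm_set_sd(3.0);
    let c = _mm_set_sd(4.0);
    // Round to nearest with exceptions suppressed; result is 2.0*3.0 + 4.0 = 10.0.
    let r = _mm_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
    _mm_cvtsd_f64(r)
}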
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35946,8 +35891,7 @@ pub unsafe fn _mm_fmsub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: _
let extractc: f32 = simd_extract(c, 0);
let extractc = -extractc;
let fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35978,8 +35922,7 @@ pub unsafe fn _mm_mask_fmsub_round_ss<const ROUNDING: i32>(
let extractc = -extractc;
fmsub = vfmadd132ss(fmsub, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36011,8 +35954,7 @@ pub unsafe fn _mm_maskz_fmsub_round_ss<const ROUNDING: i32>(
let extractc = -extractc;
fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
@@ -36043,8 +35985,7 @@ pub unsafe fn _mm_mask3_fmsub_round_ss<const ROUNDING: i32>(
let extractc = -fmsub;
fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(c, 0, fmsub);
- transmute(r)
+ simd_insert(c, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
@@ -36072,8 +36013,7 @@ pub unsafe fn _mm_fmsub_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
let extractc = -extractc;
let fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36104,8 +36044,7 @@ pub unsafe fn _mm_mask_fmsub_round_sd<const ROUNDING: i32>(
let extractc = -extractc;
fmsub = vfmadd132sd(fmsub, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36137,8 +36076,7 @@ pub unsafe fn _mm_maskz_fmsub_round_sd<const ROUNDING: i32>(
let extractc = -extractc;
fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
@@ -36169,8 +36107,7 @@ pub unsafe fn _mm_mask3_fmsub_round_sd<const ROUNDING: i32>(
let extractc = -fmsub;
fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(c, 0, fmsub);
- transmute(r)
+ simd_insert(c, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36194,8 +36131,7 @@ pub unsafe fn _mm_fnmadd_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c:
let extractb: f32 = simd_extract(b, 0);
let extractc: f32 = simd_extract(c, 0);
let fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36226,8 +36162,7 @@ pub unsafe fn _mm_mask_fnmadd_round_ss<const ROUNDING: i32>(
let extractc: f32 = simd_extract(c, 0);
fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36259,8 +36194,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_ss<const ROUNDING: i32>(
let extractc: f32 = simd_extract(c, 0);
fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
@@ -36291,8 +36225,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_ss<const ROUNDING: i32>(
let extractb: f32 = simd_extract(b, 0);
fnmadd = vfmadd132ss(extracta, extractb, fnmadd, ROUNDING);
}
- let r = simd_insert(c, 0, fnmadd);
- transmute(r)
+ simd_insert(c, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
@@ -36320,8 +36253,7 @@ pub unsafe fn _mm_fnmadd_round_sd<const ROUNDING: i32>(
let extractb: f64 = simd_extract(b, 0);
let extractc: f64 = simd_extract(c, 0);
let fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36352,8 +36284,7 @@ pub unsafe fn _mm_mask_fnmadd_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36385,8 +36316,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
@@ -36417,8 +36347,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd<const ROUNDING: i32>(
let extractb: f64 = simd_extract(b, 0);
fnmadd = vfmadd132sd(extracta, extractb, fnmadd, ROUNDING);
}
- let r = simd_insert(c, 0, fnmadd);
- transmute(r)
+ simd_insert(c, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36443,8 +36372,7 @@ pub unsafe fn _mm_fnmsub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c:
let extractc: f32 = simd_extract(c, 0);
let extractc = -extractc;
let fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36476,8 +36404,7 @@ pub unsafe fn _mm_mask_fnmsub_round_ss<const ROUNDING: i32>(
let extractc = -extractc;
fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36510,8 +36437,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss<const ROUNDING: i32>(
let extractc = -extractc;
fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
@@ -36543,8 +36469,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_ss<const ROUNDING: i32>(
let extractc = -fnmsub;
fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(c, 0, fnmsub);
- transmute(r)
+ simd_insert(c, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
@@ -36573,8 +36498,7 @@ pub unsafe fn _mm_fnmsub_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
let extractc = -extractc;
let fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36606,8 +36530,7 @@ pub unsafe fn _mm_mask_fnmsub_round_sd<const ROUNDING: i32>(
let extractc = -extractc;
fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36640,8 +36563,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd<const ROUNDING: i32>(
let extractc = -extractc;
fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
@@ -36673,8 +36595,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_sd<const ROUNDING: i32>(
let extractc = -fnmsub;
fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(c, 0, fnmsub);
- transmute(r)
+ simd_insert(c, 0, fnmsub)
}
/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
@@ -37168,8 +37089,7 @@ pub unsafe fn _mm_maskz_cvt_roundsd_ss<const ROUNDING: i32>(
pub unsafe fn _mm_cvt_roundss_si32<const ROUNDING: i32>(a: __m128) -> i32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f32x4();
- let r = vcvtss2si(a, ROUNDING);
- transmute(r)
+ vcvtss2si(a, ROUNDING)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
@@ -37188,8 +37108,7 @@ pub unsafe fn _mm_cvt_roundss_si32<const ROUNDING: i32>(a: __m128) -> i32 {
pub unsafe fn _mm_cvt_roundss_i32<const ROUNDING: i32>(a: __m128) -> i32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f32x4();
- let r = vcvtss2si(a, ROUNDING);
- transmute(r)
+ vcvtss2si(a, ROUNDING)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\
@@ -37208,8 +37127,7 @@ pub unsafe fn _mm_cvt_roundss_i32<const ROUNDING: i32>(a: __m128) -> i32 {
pub unsafe fn _mm_cvt_roundss_u32<const ROUNDING: i32>(a: __m128) -> u32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f32x4();
- let r = vcvtss2usi(a, ROUNDING);
- transmute(r)
+ vcvtss2usi(a, ROUNDING)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
@@ -37219,7 +37137,7 @@ pub unsafe fn _mm_cvt_roundss_u32<const ROUNDING: i32>(a: __m128) -> u32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2si))]
pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 {
- transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
@@ -37229,7 +37147,7 @@ pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2usi))]
pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 {
- transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
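These conversions now return the scalar directly; a sketch of the rounding-mode selection (assumes AVX-512F at runtime):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn cvt_round_demo() -> (i32, u32, i32) {
    let a = _mm_set_ss(2.5);
    let down = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); // 2
    let up = _mm_cvt_roundss_u32::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a);    // 3
    let cur = _mm_cvtss_i32(a); // uses the current rounding mode (2 under the default round-to-even)
    (down, up, cur)
}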
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
@@ -37248,8 +37166,7 @@ pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 {
pub unsafe fn _mm_cvt_roundsd_si32<const ROUNDING: i32>(a: __m128d) -> i32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f64x2();
- let r = vcvtsd2si(a, ROUNDING);
- transmute(r)
+ vcvtsd2si(a, ROUNDING)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
@@ -37268,8 +37185,7 @@ pub unsafe fn _mm_cvt_roundsd_si32<const ROUNDING: i32>(a: __m128d) -> i32 {
pub unsafe fn _mm_cvt_roundsd_i32<const ROUNDING: i32>(a: __m128d) -> i32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f64x2();
- let r = vcvtsd2si(a, ROUNDING);
- transmute(r)
+ vcvtsd2si(a, ROUNDING)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\
@@ -37288,8 +37204,7 @@ pub unsafe fn _mm_cvt_roundsd_i32<const ROUNDING: i32>(a: __m128d) -> i32 {
pub unsafe fn _mm_cvt_roundsd_u32<const ROUNDING: i32>(a: __m128d) -> u32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f64x2();
- let r = vcvtsd2usi(a, ROUNDING);
- transmute(r)
+ vcvtsd2usi(a, ROUNDING)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
@@ -37299,7 +37214,7 @@ pub unsafe fn _mm_cvt_roundsd_u32<const ROUNDING: i32>(a: __m128d) -> u32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2si))]
pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 {
- transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
@@ -37309,7 +37224,7 @@ pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2usi))]
pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 {
- transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -37382,8 +37297,7 @@ pub unsafe fn _mm_cvt_roundu32_ss<const ROUNDING: i32>(a: __m128, b: u32) -> __m
#[cfg_attr(test, assert_instr(vcvtsi2ss))]
pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 {
let b = b as f32;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Convert the signed 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
@@ -37394,8 +37308,7 @@ pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 {
#[cfg_attr(test, assert_instr(vcvtsi2sd))]
pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d {
let b = b as f64;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
@@ -37409,8 +37322,7 @@ pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d {
pub unsafe fn _mm_cvtt_roundss_si32<const SAE: i32>(a: __m128) -> i32 {
static_assert_sae!(SAE);
let a = a.as_f32x4();
- let r = vcvtss2si(a, SAE);
- transmute(r)
+ vcvtss2si(a, SAE)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
@@ -37424,8 +37336,7 @@ pub unsafe fn _mm_cvtt_roundss_si32<const SAE: i32>(a: __m128) -> i32 {
pub unsafe fn _mm_cvtt_roundss_i32<const SAE: i32>(a: __m128) -> i32 {
static_assert_sae!(SAE);
let a = a.as_f32x4();
- let r = vcvtss2si(a, SAE);
- transmute(r)
+ vcvtss2si(a, SAE)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\
@@ -37439,8 +37350,7 @@ pub unsafe fn _mm_cvtt_roundss_i32<const SAE: i32>(a: __m128) -> i32 {
pub unsafe fn _mm_cvtt_roundss_u32<const SAE: i32>(a: __m128) -> u32 {
static_assert_sae!(SAE);
let a = a.as_f32x4();
- let r = vcvtss2usi(a, SAE);
- transmute(r)
+ vcvtss2usi(a, SAE)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
@@ -37450,7 +37360,7 @@ pub unsafe fn _mm_cvtt_roundss_u32<const SAE: i32>(a: __m128) -> u32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2si))]
pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 {
- transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
@@ -37460,7 +37370,7 @@ pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2usi))]
pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 {
- transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
@@ -37474,8 +37384,7 @@ pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 {
pub unsafe fn _mm_cvtt_roundsd_si32<const SAE: i32>(a: __m128d) -> i32 {
static_assert_sae!(SAE);
let a = a.as_f64x2();
- let r = vcvtsd2si(a, SAE);
- transmute(r)
+ vcvtsd2si(a, SAE)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
@@ -37489,8 +37398,7 @@ pub unsafe fn _mm_cvtt_roundsd_si32<const SAE: i32>(a: __m128d) -> i32 {
pub unsafe fn _mm_cvtt_roundsd_i32<const SAE: i32>(a: __m128d) -> i32 {
static_assert_sae!(SAE);
let a = a.as_f64x2();
- let r = vcvtsd2si(a, SAE);
- transmute(r)
+ vcvtsd2si(a, SAE)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\
@@ -37504,8 +37412,7 @@ pub unsafe fn _mm_cvtt_roundsd_i32<const SAE: i32>(a: __m128d) -> i32 {
pub unsafe fn _mm_cvtt_roundsd_u32<const SAE: i32>(a: __m128d) -> u32 {
static_assert_sae!(SAE);
let a = a.as_f64x2();
- let r = vcvtsd2usi(a, SAE);
- transmute(r)
+ vcvtsd2usi(a, SAE)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
@@ -37515,7 +37422,7 @@ pub unsafe fn _mm_cvtt_roundsd_u32<const SAE: i32>(a: __m128d) -> u32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2si))]
pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 {
- transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
@@ -37525,7 +37432,7 @@ pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2usi))]
pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 {
- transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -37536,8 +37443,7 @@ pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 {
#[cfg_attr(test, assert_instr(vcvtusi2ss))]
pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 {
let b = b as f32;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
@@ -37548,8 +37454,7 @@ pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 {
#[cfg_attr(test, assert_instr(vcvtusi2sd))]
pub unsafe fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d {
let b = b as f64;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\
@@ -37565,8 +37470,7 @@ pub unsafe fn _mm_comi_round_ss<const IMM5: i32, const SAE: i32>(a: __m128, b: _
static_assert_mantissas_sae!(SAE);
let a = a.as_f32x4();
let b = b.as_f32x4();
- let r = vcomiss(a, b, IMM5, SAE);
- transmute(r)
+ vcomiss(a, b, IMM5, SAE)
}
/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\
@@ -37582,8 +37486,7 @@ pub unsafe fn _mm_comi_round_sd<const IMM5: i32, const SAE: i32>(a: __m128d, b:
static_assert_mantissas_sae!(SAE);
let a = a.as_f64x2();
let b = b.as_f64x2();
- let r = vcomisd(a, b, IMM5, SAE);
- transmute(r)
+ vcomisd(a, b, IMM5, SAE)
}
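A sketch of the predicate and SAE parameters for these compares (assumes AVX-512F at runtime; `_CMP_LT_OS`/`_CMP_GT_OS` are the standard AVX compare-predicate constants):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn comi_demo() -> (i32, i32) {
    let a = _mm_set_ss(1.0);
    let b = _mm_set_ss(2.0);
    // 1.0 < 2.0, so the first compare returns 1 and the reversed one returns 0.
    let lt = _mm_comi_round_ss::<_CMP_LT_OS, _MM_FROUND_NO_EXC>(a, b);
    let gt = _mm_comi_round_ss::<_CMP_GT_OS, _MM_FROUND_NO_EXC>(a, b);
    (lt, gt)
}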
/// Equal
diff --git a/library/stdarch/crates/core_arch/src/x86/sse.rs b/library/stdarch/crates/core_arch/src/x86/sse.rs
index 3d4471ba3..6a2be0921 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse.rs
@@ -790,8 +790,7 @@ pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
///
/// The result is rounded according to the current rounding mode. If the result
/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
-/// (`i32::MIN`) or an invalid operation floating point exception if
-/// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+/// (`i32::MIN`).
///
/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
///
@@ -821,8 +820,7 @@ pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 {
///
/// The result is rounded always using truncation (round towards zero). If the
/// result cannot be represented as a 32 bit integer the result will be
-/// `0x8000_0000` (`i32::MIN`) or an invalid operation floating point
-/// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+/// `0x8000_0000` (`i32::MIN`).
///
/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
///
@@ -1083,7 +1081,10 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(movmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
- movmskps(a)
+ // Propagate the highest bit to the rest, because simd_bitmask
+ // requires all-1 or all-0.
+ let mask: i32x4 = simd_lt(transmute(a), i32x4::splat(0));
+ simd_bitmask::<i32x4, u8>(mask).into()
}
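The rewritten body above reproduces the movmskps semantics with generic SIMD ops; as a hedged scalar sketch of what the intrinsic computes (the helper name is illustrative only, not part of the crate):

// Scalar model of _mm_movemask_ps: bit i of the result is the sign bit of lane i.
fn movemask_ps_model(lanes: [f32; 4]) -> i32 {
    lanes
        .iter()
        .enumerate()
        .fold(0, |mask, (i, x)| mask | (((x.to_bits() >> 31) as i32) << i))
}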
/// Construct a `__m128` with the lowest element read from `p` and the other
@@ -1365,6 +1366,15 @@ pub unsafe fn _mm_sfence() {
/// Gets the unsigned 32-bit value of the MXCSR control and status register.
///
+/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust
+/// floating-point operations may or may not result in this register getting updated with exception
+/// state, and the register can change between two invocations of this function even when no
+/// floating-point operations appear in the source code (since floating-point operations appearing
+/// earlier or later can be reordered).
+///
+/// If you need to perform some floating-point operations and check whether they raised an
+/// exception, use an inline assembly block for the entire sequence of operations.
+///
/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr)
@@ -1372,6 +1382,10 @@ pub unsafe fn _mm_sfence() {
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(stmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _mm_getcsr() -> u32 {
let mut result = 0_i32;
stmxcsr(&mut result as *mut _ as *mut i8);
@@ -1401,6 +1415,16 @@ pub unsafe fn _mm_getcsr() -> u32 {
/// * The *denormals-are-zero mode flag* turns all numbers which would be
/// denormalized (exponent bits are all zeros) into zeros.
///
+/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to
+/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and
+/// will optimize accordingly. This even applies when the register is altered and later reset to its
+/// original value without any floating-point operations appearing in the source code between those
+/// operations (since floating-point operations appearing earlier or later can be reordered).
+///
+/// If you need to perform some floating-point operations under different masking flags, rounding

+/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the
+/// original MXCSR register state before the end of the block.
+///
/// ## Exception Flags
///
/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
@@ -1509,6 +1533,10 @@ pub unsafe fn _mm_getcsr() -> u32 {
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ldmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _mm_setcsr(val: u32) {
ldmxcsr(&val as *const _ as *const i8);
}
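Since the new documentation points users at inline assembly instead of _mm_setcsr, here is a minimal sketch of that pattern, assuming an x86_64 target and the stable asm! macro; the function name and the flush-to-zero example are illustrative, not something the crate provides:

// Save MXCSR, set the flush-to-zero bit, do the floating-point work inside the
// same asm block, then restore the original MXCSR before Rust code runs again.
use std::arch::asm;
unsafe fn sse_work_with_ftz() {
    let mut saved: u32 = 0;
    let mut modified: u32 = 0;
    asm!(
        "stmxcsr [{saved}]",      // save the current MXCSR
        "mov eax, [{saved}]",
        "or eax, 0x8000",         // set the flush-to-zero bit (bit 15)
        "mov [{modified}], eax",
        "ldmxcsr [{modified}]",
        // ... the floating-point work that needs FTZ goes here, still inside asm ...
        "ldmxcsr [{saved}]",      // restore the original MXCSR
        saved = in(reg) &mut saved as *mut u32,
        modified = in(reg) &mut modified as *mut u32,
        out("eax") _,
    );
}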
@@ -1588,9 +1616,14 @@ pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
_mm_getcsr() & _MM_MASK_MASK
}
@@ -1599,9 +1632,14 @@ pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
_mm_getcsr() & _MM_EXCEPT_MASK
}
@@ -1610,9 +1648,14 @@ pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
_mm_getcsr() & _MM_FLUSH_ZERO_MASK
}
@@ -1621,9 +1664,14 @@ pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
_mm_getcsr() & _MM_ROUND_MASK
}
@@ -1632,9 +1680,14 @@ pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
_mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
}
@@ -1643,9 +1696,14 @@ pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
_mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
}
@@ -1654,9 +1712,14 @@ pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
// println!("setting csr={:x}", val);
@@ -1667,9 +1730,14 @@ pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
_mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
}
@@ -1820,8 +1888,6 @@ extern "C" {
fn maxss(a: __m128, b: __m128) -> __m128;
#[link_name = "llvm.x86.sse.max.ps"]
fn maxps(a: __m128, b: __m128) -> __m128;
- #[link_name = "llvm.x86.sse.movmsk.ps"]
- fn movmskps(a: __m128) -> i32;
#[link_name = "llvm.x86.sse.cmp.ps"]
fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
#[link_name = "llvm.x86.sse.comieq.ss"]
@@ -1974,7 +2040,11 @@ mod tests {
let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
let r = _mm_rcp_ss(a);
let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
- assert_eq_m128(r, e);
+ let rel_err = 0.00048828125;
+ assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err);
+ for i in 1..4 {
+ assert_eq!(get_m128(r, i), get_m128(e, i));
+ }
}
#[simd_test(enable = "sse")]
@@ -2055,6 +2125,17 @@ mod tests {
let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
let r = _mm_max_ps(a, b);
assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));
+
+ // Check SSE-specific semantics for -0.0 handling.
+ let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
+ let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
+ let r1: [u8; 16] = transmute(_mm_max_ps(a, b));
+ let r2: [u8; 16] = transmute(_mm_max_ps(b, a));
+ let a: [u8; 16] = transmute(a);
+ let b: [u8; 16] = transmute(b);
+ assert_eq!(r1, b);
+ assert_eq!(r2, a);
+ assert_ne!(a, b); // sanity check that -0.0 is actually present
}
#[simd_test(enable = "sse")]
@@ -2098,12 +2179,12 @@ mod tests {
let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
- let e: u32x4 = transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0));
+ let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0));
assert_eq!(r, e);
let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
- let e2: u32x4 = transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0));
+ let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0));
assert_eq!(r2, e2);
}
@@ -2119,15 +2200,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) < d.extract(0)
let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2143,15 +2224,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) <= d.extract(0)
let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2167,15 +2248,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) > d.extract(0)
let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2191,15 +2272,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) >= d.extract(0)
let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2215,15 +2296,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) != d.extract(0)
let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2244,15 +2325,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) >= d.extract(0)
let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2273,15 +2354,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) > d.extract(0)
let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2302,15 +2383,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) <= d.extract(0)
let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2331,15 +2412,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) < d.extract(0)
let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2355,15 +2436,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) ord d.extract(0)
let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2379,15 +2460,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) unord d.extract(0)
let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2766,7 +2847,9 @@ mod tests {
}
}
+ #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
#[simd_test(enable = "sse")]
+ #[cfg_attr(miri, ignore)] // Uses _mm_setcsr, which is not supported by Miri
unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
// If one of the arguments is a quiet NaN `comieq_ss` should signal an
// Invalid Operation Exception while `ucomieq_ss` should not.
@@ -3072,7 +3155,7 @@ mod tests {
let mut p = vals.as_mut_ptr();
if (p as usize) & 0xf != 0 {
- ofs = ((16 - (p as usize)) & 0xf) >> 2;
+ ofs = (16 - ((p as usize) & 0xf)) >> 2;
p = p.add(ofs);
}
@@ -3098,7 +3181,7 @@ mod tests {
// Align p to 16-byte boundary
if (p as usize) & 0xf != 0 {
- ofs = ((16 - (p as usize)) & 0xf) >> 2;
+ ofs = (16 - ((p as usize) & 0xf)) >> 2;
p = p.add(ofs);
}
@@ -3124,7 +3207,7 @@ mod tests {
// Align p to 16-byte boundary
if (p as usize) & 0xf != 0 {
- ofs = ((16 - (p as usize)) & 0xf) >> 2;
+ ofs = (16 - ((p as usize) & 0xf)) >> 2;
p = p.add(ofs);
}
@@ -3186,11 +3269,15 @@ mod tests {
}
#[simd_test(enable = "sse")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_sfence() {
_mm_sfence();
}
+ #[allow(deprecated)] // FIXME: This tests functions that are immediate UB
#[simd_test(enable = "sse")]
+ #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
unsafe fn test_mm_getcsr_setcsr_1() {
let saved_csr = _mm_getcsr();
@@ -3206,7 +3293,9 @@ mod tests {
assert_eq_m128(r, exp); // first component is a denormalized f32
}
+ #[allow(deprecated)] // FIXME: This tests functions that are immediate UB
#[simd_test(enable = "sse")]
+ #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
unsafe fn test_mm_getcsr_setcsr_2() {
// Same as _mm_setcsr_1 test, but with opposite flag value.
@@ -3224,7 +3313,9 @@ mod tests {
assert_eq_m128(r, exp); // first component is a denormalized f32
}
+ #[allow(deprecated)] // FIXME: This tests functions that are immediate UB
#[simd_test(enable = "sse")]
+ #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
unsafe fn test_mm_getcsr_setcsr_underflow() {
_MM_SET_EXCEPTION_STATE(0);
@@ -3263,6 +3354,9 @@ mod tests {
}
#[simd_test(enable = "sse")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_ps() {
let a = _mm_set1_ps(7.0);
let mut mem = Memory { data: [-1.0; 4] };
diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs
index 3d572a1f5..7831ea743 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs
@@ -165,7 +165,10 @@ pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
- transmute(pavgb(a.as_u8x16(), b.as_u8x16()))
+ let a = simd_cast::<_, u16x16>(a.as_u8x16());
+ let b = simd_cast::<_, u16x16>(b.as_u8x16());
+ let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
+ transmute(simd_cast::<_, u8x16>(r))
}
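The widened-arithmetic form above implements the usual rounding average; per byte it is equivalent to this scalar model (illustrative helper, not part of the crate):

// Scalar model of _mm_avg_epu8 per lane: widen, add, add 1, halve (rounds up on ties).
fn avg_u8(a: u8, b: u8) -> u8 {
    ((a as u16 + b as u16 + 1) >> 1) as u8
}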
/// Averages packed unsigned 16-bit integers in `a` and `b`.
@@ -176,7 +179,10 @@ pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
- transmute(pavgw(a.as_u16x8(), b.as_u16x8()))
+ let a = simd_cast::<_, u32x8>(a.as_u16x8());
+ let b = simd_cast::<_, u32x8>(b.as_u16x8());
+ let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
+ transmute(simd_cast::<_, u16x8>(r))
}
/// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`.
@@ -261,7 +267,10 @@ pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
- transmute(pmulhw(a.as_i16x8(), b.as_i16x8()))
+ let a = simd_cast::<_, i32x8>(a.as_i16x8());
+ let b = simd_cast::<_, i32x8>(b.as_i16x8());
+ let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
+ transmute(simd_cast::<i32x8, i16x8>(r))
}
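For reference, the per-lane semantics the widening form above preserves, as an illustrative scalar model:

// Scalar model of _mm_mulhi_epi16 per lane: the high 16 bits of the full 32-bit product.
fn mulhi_i16(a: i16, b: i16) -> i16 {
    ((a as i32 * b as i32) >> 16) as i16
}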
/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
@@ -275,7 +284,10 @@ pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
- transmute(pmulhuw(a.as_u16x8(), b.as_u16x8()))
+ let a = simd_cast::<_, u32x8>(a.as_u16x8());
+ let b = simd_cast::<_, u32x8>(b.as_u16x8());
+ let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
+ transmute(simd_cast::<u32x8, u16x8>(r))
}
/// Multiplies the packed 16-bit integers in `a` and `b`.
@@ -303,7 +315,10 @@ pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
- transmute(pmuludq(a.as_u32x4(), b.as_u32x4()))
+ let a = a.as_u64x2();
+ let b = b.as_u64x2();
+ let mask = u64x2::splat(u32::MAX.into());
+ transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
}
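The mask above keeps only the low 32 bits of each 64-bit lane before multiplying, which matches pmuludq; per 64-bit lane this amounts to (illustrative helper):

// Scalar model of one lane of _mm_mul_epu32: multiply the low 32 bits of each
// 64-bit lane as unsigned values, keeping the full 64-bit product.
fn mul_epu32_lane(a: u64, b: u64) -> u64 {
    (a as u32 as u64) * (b as u32 as u64)
}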
/// Sum the absolute differences of packed unsigned 8-bit integers.
@@ -952,7 +967,7 @@ pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
#[cfg_attr(test, assert_instr(cvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
- cvtdq2ps(a.as_i32x4())
+ transmute(simd_cast::<_, f32x4>(a.as_i32x4()))
}
/// Converts packed single-precision (32-bit) floating-point elements in `a`
@@ -2240,7 +2255,9 @@ pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
#[cfg_attr(test, assert_instr(cvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
- cvtpd2ps(a)
+ let r = simd_cast::<_, f32x2>(a.as_f64x2());
+ let zero = f32x2::new(0.0, 0.0);
+ transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
}
/// Converts packed single-precision (32-bit) floating-point elements in `a` to
@@ -2253,7 +2270,8 @@ pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
#[cfg_attr(test, assert_instr(cvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d {
- cvtps2pd(a)
+ let a = a.as_f32x4();
+ transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
}
/// Converts packed double-precision (64-bit) floating-point elements in `a` to
@@ -2432,7 +2450,10 @@ pub unsafe fn _mm_setzero_pd() -> __m128d {
#[cfg_attr(test, assert_instr(movmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 {
- movmskpd(a)
+ // Propagate the highest bit to the rest, because simd_bitmask
+ // requires all-1 or all-0.
+ let mask: i64x2 = simd_lt(transmute(a), i64x2::splat(0));
+ simd_bitmask::<i64x2, u8>(mask).into()
}
/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
@@ -2826,18 +2847,8 @@ extern "C" {
fn lfence();
#[link_name = "llvm.x86.sse2.mfence"]
fn mfence();
- #[link_name = "llvm.x86.sse2.pavg.b"]
- fn pavgb(a: u8x16, b: u8x16) -> u8x16;
- #[link_name = "llvm.x86.sse2.pavg.w"]
- fn pavgw(a: u16x8, b: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse2.pmadd.wd"]
fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
- #[link_name = "llvm.x86.sse2.pmulh.w"]
- fn pmulhw(a: i16x8, b: i16x8) -> i16x8;
- #[link_name = "llvm.x86.sse2.pmulhu.w"]
- fn pmulhuw(a: u16x8, b: u16x8) -> u16x8;
- #[link_name = "llvm.x86.sse2.pmulu.dq"]
- fn pmuludq(a: u32x4, b: u32x4) -> u64x2;
#[link_name = "llvm.x86.sse2.psad.bw"]
fn psadbw(a: u8x16, b: u8x16) -> u64x2;
#[link_name = "llvm.x86.sse2.psll.w"]
@@ -2856,8 +2867,6 @@ extern "C" {
fn psrld(a: i32x4, count: i32x4) -> i32x4;
#[link_name = "llvm.x86.sse2.psrl.q"]
fn psrlq(a: i64x2, count: i64x2) -> i64x2;
- #[link_name = "llvm.x86.sse2.cvtdq2ps"]
- fn cvtdq2ps(a: i32x4) -> __m128;
#[link_name = "llvm.x86.sse2.cvtps2dq"]
fn cvtps2dq(a: __m128) -> i32x4;
#[link_name = "llvm.x86.sse2.maskmov.dqu"]
@@ -2908,12 +2917,6 @@ extern "C" {
fn ucomigesd(a: __m128d, b: __m128d) -> i32;
#[link_name = "llvm.x86.sse2.ucomineq.sd"]
fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
- #[link_name = "llvm.x86.sse2.movmsk.pd"]
- fn movmskpd(a: __m128d) -> i32;
- #[link_name = "llvm.x86.sse2.cvtpd2ps"]
- fn cvtpd2ps(a: __m128d) -> __m128;
- #[link_name = "llvm.x86.sse2.cvtps2pd"]
- fn cvtps2pd(a: __m128) -> __m128d;
#[link_name = "llvm.x86.sse2.cvtpd2dq"]
fn cvtpd2dq(a: __m128d) -> i32x4;
#[link_name = "llvm.x86.sse2.cvtsd2si"]
@@ -2956,11 +2959,15 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_lfence() {
_mm_lfence();
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_mfence() {
_mm_mfence();
}
@@ -3343,83 +3350,124 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_slli_epi16() {
- #[rustfmt::skip]
- let a = _mm_setr_epi16(
- 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
- );
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
let r = _mm_slli_epi16::<4>(a);
-
- #[rustfmt::skip]
- let e = _mm_setr_epi16(
- 0xFFF0 as u16 as i16, 0xFFF0 as u16 as i16, 0x0FF0, 0x00F0,
- 0, 0, 0, 0,
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
);
- assert_eq_m128i(r, e);
+ let r = _mm_slli_epi16::<16>(a);
+ assert_eq_m128i(r, _mm_set1_epi16(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sll_epi16() {
- let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0);
- let r = _mm_sll_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0));
- assert_eq_m128i(r, _mm_setr_epi16(0xFF0, 0, 0, 0, 0, 0, 0, 0));
- let r = _mm_sll_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0));
- assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0));
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
+ let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
+ );
+ let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_set1_epi16(0));
+ let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi16(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_slli_epi32() {
- let r = _mm_slli_epi32::<4>(_mm_set1_epi32(0xFFFF));
- assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_slli_epi32::<4>(a);
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
+ let r = _mm_slli_epi32::<32>(a);
+ assert_eq_m128i(r, _mm_set1_epi32(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sll_epi32() {
- let a = _mm_set1_epi32(0xFFFF);
- let b = _mm_setr_epi32(4, 0, 0, 0);
- let r = _mm_sll_epi32(a, b);
- assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
+ let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
+ assert_eq_m128i(r, _mm_set1_epi32(0));
+ let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi32(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_slli_epi64() {
- let r = _mm_slli_epi64::<4>(_mm_set1_epi64x(0xFFFFFFFF));
- assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0));
+ let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm_slli_epi64::<4>(a);
+ assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
+ let r = _mm_slli_epi64::<64>(a);
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sll_epi64() {
- let a = _mm_set1_epi64x(0xFFFFFFFF);
- let b = _mm_setr_epi64x(4, 0);
- let r = _mm_sll_epi64(a, b);
- assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0));
+ let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
+ let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
+ let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srai_epi16() {
- let r = _mm_srai_epi16::<1>(_mm_set1_epi16(-1));
- assert_eq_m128i(r, _mm_set1_epi16(-1));
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
+ let r = _mm_srai_epi16::<4>(a);
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
+ );
+ let r = _mm_srai_epi16::<16>(a);
+ assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sra_epi16() {
- let a = _mm_set1_epi16(-1);
- let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- let r = _mm_sra_epi16(a, b);
- assert_eq_m128i(r, _mm_set1_epi16(-1));
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
+ let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
+ );
+ let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
+ let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srai_epi32() {
- let r = _mm_srai_epi32::<1>(_mm_set1_epi32(-1));
- assert_eq_m128i(r, _mm_set1_epi32(-1));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_srai_epi32::<4>(a);
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
+ let r = _mm_srai_epi32::<32>(a);
+ assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sra_epi32() {
- let a = _mm_set1_epi32(-1);
- let b = _mm_setr_epi32(1, 0, 0, 0);
- let r = _mm_sra_epi32(a, b);
- assert_eq_m128i(r, _mm_set1_epi32(-1));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
+ let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
+ assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
+ let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
}
#[simd_test(enable = "sse2")]
@@ -3453,53 +3501,74 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srli_epi16() {
- #[rustfmt::skip]
- let a = _mm_setr_epi16(
- 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
- );
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
let r = _mm_srli_epi16::<4>(a);
- #[rustfmt::skip]
- let e = _mm_setr_epi16(
- 0xFFF as u16 as i16, 0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0,
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
);
- assert_eq_m128i(r, e);
+ let r = _mm_srli_epi16::<16>(a);
+ assert_eq_m128i(r, _mm_set1_epi16(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srl_epi16() {
- let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0);
- let r = _mm_srl_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0));
- assert_eq_m128i(r, _mm_setr_epi16(0xF, 0, 0, 0, 0, 0, 0, 0));
- let r = _mm_srl_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0));
- assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0));
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
+ let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
+ );
+ let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_set1_epi16(0));
+ let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi16(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srli_epi32() {
- let r = _mm_srli_epi32::<4>(_mm_set1_epi32(0xFFFF));
- assert_eq_m128i(r, _mm_set1_epi32(0xFFF));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_srli_epi32::<4>(a);
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
+ let r = _mm_srli_epi32::<32>(a);
+ assert_eq_m128i(r, _mm_set1_epi32(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srl_epi32() {
- let a = _mm_set1_epi32(0xFFFF);
- let b = _mm_setr_epi32(4, 0, 0, 0);
- let r = _mm_srl_epi32(a, b);
- assert_eq_m128i(r, _mm_set1_epi32(0xFFF));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
+ let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
+ assert_eq_m128i(r, _mm_set1_epi32(0));
+ let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi32(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srli_epi64() {
- let r = _mm_srli_epi64::<4>(_mm_set1_epi64x(0xFFFFFFFF));
- assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF));
+ let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm_srli_epi64::<4>(a);
+ assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
+ let r = _mm_srli_epi64::<64>(a);
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srl_epi64() {
- let a = _mm_set1_epi64x(0xFFFFFFFF);
- let b = _mm_setr_epi64x(4, 0);
- let r = _mm_srl_epi64(a, b);
- assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF));
+ let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
+ let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
+ let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
}
#[simd_test(enable = "sse2")]
@@ -3766,6 +3835,9 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_maskmoveu_si128() {
let a = _mm_set1_epi8(9);
#[rustfmt::skip]
@@ -3804,6 +3876,9 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_si128() {
let a = _mm_setr_epi32(1, 2, 3, 4);
let mut r = _mm_undefined_si128();
@@ -3812,6 +3887,9 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_si32() {
let a: i32 = 7;
let mut mem = boxed::Box::<i32>::new(-1);
@@ -4055,6 +4133,17 @@ mod tests {
let b = _mm_setr_pd(5.0, 10.0);
let r = _mm_max_pd(a, b);
assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));
+
+ // Check SSE(2)-specific semantics for -0.0 handling.
+ let a = _mm_setr_pd(-0.0, 0.0);
+ let b = _mm_setr_pd(0.0, 0.0);
+ let r1: [u8; 16] = transmute(_mm_max_pd(a, b));
+ let r2: [u8; 16] = transmute(_mm_max_pd(b, a));
+ let a: [u8; 16] = transmute(a);
+ let b: [u8; 16] = transmute(b);
+ assert_eq!(r1, b);
+ assert_eq!(r2, a);
+ assert_ne!(a, b); // sanity check that -0.0 is actually present
}
#[simd_test(enable = "sse2")]
@@ -4071,6 +4160,17 @@ mod tests {
let b = _mm_setr_pd(5.0, 10.0);
let r = _mm_min_pd(a, b);
assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
+
+ // Check SSE(2)-specific semantics for -0.0 handling.
+ let a = _mm_setr_pd(-0.0, 0.0);
+ let b = _mm_setr_pd(0.0, 0.0);
+ let r1: [u8; 16] = transmute(_mm_min_pd(a, b));
+ let r2: [u8; 16] = transmute(_mm_min_pd(b, a));
+ let a: [u8; 16] = transmute(a);
+ let b: [u8; 16] = transmute(b);
+ assert_eq!(r1, b);
+ assert_eq!(r2, a);
+ assert_ne!(a, b); // sanity check that -0.0 is actually present
}
#[simd_test(enable = "sse2")]
@@ -4158,7 +4258,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpeq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4166,7 +4266,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmplt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4174,7 +4274,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmple_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4182,7 +4282,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpgt_sd() {
let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4190,7 +4290,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpge_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4198,7 +4298,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpord_sd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4206,7 +4306,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpunord_sd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4214,7 +4314,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpneq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4222,7 +4322,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpnlt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4230,7 +4330,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpnle_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4238,7 +4338,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpngt_sd() {
let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4246,7 +4346,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpnge_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4478,6 +4578,9 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_pd() {
#[repr(align(128))]
struct Memory {
diff --git a/library/stdarch/crates/core_arch/src/x86/sse3.rs b/library/stdarch/crates/core_arch/src/x86/sse3.rs
index 092a8d9cd..df0d78e5b 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse3.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse3.rs
@@ -1,7 +1,7 @@
//! Streaming SIMD Extensions 3 (SSE3)
use crate::{
- core_arch::{simd::*, simd_llvm::simd_shuffle, x86::*},
+ core_arch::{simd::*, simd_llvm::*, x86::*},
mem::transmute,
};
@@ -17,7 +17,11 @@ use stdarch_test::assert_instr;
#[cfg_attr(test, assert_instr(addsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
- addsubps(a, b)
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let add = simd_add(a, b);
+ let sub = simd_sub(a, b);
+ simd_shuffle!(add, sub, [4, 1, 6, 3])
}
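The shuffle indices [4, 1, 6, 3] pick the subtraction results for even lanes and the addition results for odd lanes; as a hedged scalar sketch (helper name is illustrative):

// Scalar model of _mm_addsub_ps: even lanes are a - b, odd lanes are a + b.
fn addsub_ps_model(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [a[0] - b[0], a[1] + b[1], a[2] - b[2], a[3] + b[3]]
}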
/// Alternatively add and subtract packed double-precision (64-bit)
@@ -29,7 +33,11 @@ pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(addsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d {
- addsubpd(a, b)
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let add = simd_add(a, b);
+ let sub = simd_sub(a, b);
+ simd_shuffle!(add, sub, [2, 1])
}
/// Horizontally adds adjacent pairs of double-precision (64-bit)
@@ -143,10 +151,6 @@ pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 {
#[allow(improper_ctypes)]
extern "C" {
- #[link_name = "llvm.x86.sse3.addsub.ps"]
- fn addsubps(a: __m128, b: __m128) -> __m128;
- #[link_name = "llvm.x86.sse3.addsub.pd"]
- fn addsubpd(a: __m128d, b: __m128d) -> __m128d;
#[link_name = "llvm.x86.sse3.hadd.pd"]
fn haddpd(a: __m128d, b: __m128d) -> __m128d;
#[link_name = "llvm.x86.sse3.hadd.ps"]
diff --git a/library/stdarch/crates/core_arch/src/x86/sse41.rs b/library/stdarch/crates/core_arch/src/x86/sse41.rs
index 7ba86e5f7..6d33238b0 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse41.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs
@@ -62,7 +62,8 @@ pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTI
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
- transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
+ let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::splat(0));
+ transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
}
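The simd_lt/simd_select pair above reproduces the pblendvb rule that only the mask's sign bit matters; per byte this is (illustrative helper):

// Scalar model of _mm_blendv_epi8 per byte: the sign bit of the mask selects b over a.
fn blendv_byte(a: i8, b: i8, mask: i8) -> i8 {
    if mask < 0 { b } else { a }
}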
/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
@@ -74,15 +75,25 @@ pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
-// Note: LLVM7 prefers the single-precision floating-point domain when possible
-// see https://bugs.llvm.org/show_bug.cgi?id=38195
-// #[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xF0))]
-#[cfg_attr(test, assert_instr(blendps, IMM8 = 0xF0))]
+#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
static_assert_uimm_bits!(IMM8, 8);
- transmute(pblendw(a.as_i16x8(), b.as_i16x8(), IMM8 as u8))
+ transmute::<i16x8, _>(simd_shuffle!(
+ a.as_i16x8(),
+ b.as_i16x8(),
+ [
+ [0, 8][IMM8 as usize & 1],
+ [1, 9][(IMM8 >> 1) as usize & 1],
+ [2, 10][(IMM8 >> 2) as usize & 1],
+ [3, 11][(IMM8 >> 3) as usize & 1],
+ [4, 12][(IMM8 >> 4) as usize & 1],
+ [5, 13][(IMM8 >> 5) as usize & 1],
+ [6, 14][(IMM8 >> 6) as usize & 1],
+ [7, 15][(IMM8 >> 7) as usize & 1],
+ ]
+ ))
}
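The shuffle table above encodes the same selection rule as pblendw; as an illustrative scalar model of the whole vector (the helper name is made up):

// Scalar model of _mm_blend_epi16: bit i of IMM8 selects lane i of b, otherwise lane i of a.
fn blend_epi16_model(a: [i16; 8], b: [i16; 8], imm8: u8) -> [i16; 8] {
    core::array::from_fn(|i| if (imm8 & (1u8 << i)) != 0 { b[i] } else { a[i] })
}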
/// Blend packed double-precision (64-bit) floating-point elements from `a`
@@ -94,7 +105,8 @@ pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
- blendvpd(a, b, mask)
+ let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::splat(0));
+ transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
}
/// Blend packed single-precision (32-bit) floating-point elements from `a`
@@ -106,7 +118,8 @@ pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
- blendvps(a, b, mask)
+ let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::splat(0));
+ transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
}
/// Blend packed double-precision (64-bit) floating-point elements from `a`
@@ -123,7 +136,11 @@ pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
static_assert_uimm_bits!(IMM2, 2);
- blendpd(a, b, IMM2 as u8)
+ transmute::<f64x2, _>(simd_shuffle!(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
+ ))
}
/// Blend packed single-precision (32-bit) floating-point elements from `a`
@@ -137,7 +154,16 @@ pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
static_assert_uimm_bits!(IMM4, 4);
- blendps(a, b, IMM4 as u8)
+ transmute::<f32x4, _>(simd_shuffle!(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ [
+ [0, 4][IMM4 as usize & 1],
+ [1, 5][(IMM4 >> 1) as usize & 1],
+ [2, 6][(IMM4 >> 2) as usize & 1],
+ [3, 7][(IMM4 >> 3) as usize & 1],
+ ]
+ ))
}
/// Extracts a single-precision (32-bit) floating-point element from `a`,
@@ -175,7 +201,7 @@ pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
static_assert_uimm_bits!(IMM8, 2);
- transmute(simd_extract::<_, f32>(a, IMM8 as u32))
+ simd_extract::<_, f32>(a, IMM8 as u32).to_bits() as i32
}
/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
@@ -923,7 +949,9 @@ pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
- transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
+ let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2()));
+ let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2()));
+ transmute(simd_mul(a, b))
}
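The double cast above truncates each 64-bit lane to its low 32 bits and sign-extends it back, matching pmuldq; per lane this is (illustrative helper):

// Scalar model of one lane of _mm_mul_epi32: sign-extend the low 32 bits of each
// 64-bit lane and multiply, keeping the full 64-bit product.
fn mul_epi32_lane(a: i64, b: i64) -> i64 {
    (a as i32 as i64) * (b as i32 as i64)
}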
/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate
@@ -1124,18 +1152,6 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
#[allow(improper_ctypes)]
extern "C" {
- #[link_name = "llvm.x86.sse41.pblendvb"]
- fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
- #[link_name = "llvm.x86.sse41.blendvpd"]
- fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
- #[link_name = "llvm.x86.sse41.blendvps"]
- fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
- #[link_name = "llvm.x86.sse41.blendpd"]
- fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
- #[link_name = "llvm.x86.sse41.blendps"]
- fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
- #[link_name = "llvm.x86.sse41.pblendw"]
- fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
#[link_name = "llvm.x86.sse41.insertps"]
fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
#[link_name = "llvm.x86.sse41.packusdw"]
@@ -1154,8 +1170,6 @@ extern "C" {
fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
#[link_name = "llvm.x86.sse41.phminposuw"]
fn phminposuw(a: u16x8) -> u16x8;
- #[link_name = "llvm.x86.sse41.pmuldq"]
- fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
#[link_name = "llvm.x86.sse41.mpsadbw"]
fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
#[link_name = "llvm.x86.sse41.ptestz"]
@@ -1245,9 +1259,9 @@ mod tests {
#[simd_test(enable = "sse4.1")]
unsafe fn test_mm_extract_ps() {
let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
- let r: f32 = transmute(_mm_extract_ps::<1>(a));
+ let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32);
assert_eq!(r, 1.0);
- let r: f32 = transmute(_mm_extract_ps::<3>(a));
+ let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32);
assert_eq!(r, 3.0);
}
@@ -1668,6 +1682,7 @@ mod tests {
assert_eq_m128(r, e);
}
+ #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
#[simd_test(enable = "sse4.1")]
unsafe fn test_mm_round_sd() {
let a = _mm_setr_pd(1.5, 3.5);
@@ -1680,6 +1695,7 @@ mod tests {
assert_eq_m128d(r, e);
}
+ #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
#[simd_test(enable = "sse4.1")]
unsafe fn test_mm_round_ss() {
let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
diff --git a/library/stdarch/crates/core_arch/src/x86/test.rs b/library/stdarch/crates/core_arch/src/x86/test.rs
index ec4298033..50b2d93be 100644
--- a/library/stdarch/crates/core_arch/src/x86/test.rs
+++ b/library/stdarch/crates/core_arch/src/x86/test.rs
@@ -3,11 +3,13 @@
use crate::core_arch::x86::*;
use std::mem::transmute;
+#[track_caller]
#[target_feature(enable = "sse2")]
pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
}
+#[track_caller]
#[target_feature(enable = "sse2")]
pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
@@ -20,6 +22,7 @@ pub unsafe fn get_m128d(a: __m128d, idx: usize) -> f64 {
transmute::<_, [f64; 2]>(a)[idx]
}
+#[track_caller]
#[target_feature(enable = "sse")]
pub unsafe fn assert_eq_m128(a: __m128, b: __m128) {
let r = _mm_cmpeq_ps(a, b);
@@ -40,11 +43,13 @@ pub unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
_mm_set_epi64x(b, a)
}
+#[track_caller]
#[target_feature(enable = "avx")]
pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b))
}
+#[track_caller]
#[target_feature(enable = "avx")]
pub unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) {
let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b);
@@ -58,6 +63,7 @@ pub unsafe fn get_m256d(a: __m256d, idx: usize) -> f64 {
transmute::<_, [f64; 4]>(a)[idx]
}
+#[track_caller]
#[target_feature(enable = "avx")]
pub unsafe fn assert_eq_m256(a: __m256, b: __m256) {
let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b);
@@ -125,10 +131,12 @@ mod x86_polyfill {
}
pub use self::x86_polyfill::*;
+#[track_caller]
pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) {
assert_eq!(transmute::<_, [i32; 16]>(a), transmute::<_, [i32; 16]>(b))
}
+#[track_caller]
pub unsafe fn assert_eq_m512(a: __m512, b: __m512) {
let cmp = _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b);
if cmp != 0b11111111_11111111 {
@@ -136,6 +144,7 @@ pub unsafe fn assert_eq_m512(a: __m512, b: __m512) {
}
}
+#[track_caller]
pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) {
let cmp = _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b);
if cmp != 0b11111111 {
diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs
index 68f332767..bace11d13 100644
--- a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs
+++ b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs
@@ -33,7 +33,7 @@ pub unsafe fn _mm_cvtss_i64(a: __m128) -> i64 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2usi))]
pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 {
- transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.
@@ -43,7 +43,7 @@ pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2usi))]
pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 {
- transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -54,8 +54,7 @@ pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 {
#[cfg_attr(test, assert_instr(vcvtsi2ss))]
pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 {
let b = b as f32;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
@@ -66,8 +65,7 @@ pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 {
#[cfg_attr(test, assert_instr(vcvtsi2sd))]
pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d {
let b = b as f64;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -78,8 +76,7 @@ pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d {
#[cfg_attr(test, assert_instr(vcvtusi2ss))]
pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 {
let b = b as f32;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
@@ -90,8 +87,7 @@ pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 {
#[cfg_attr(test, assert_instr(vcvtusi2sd))]
pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d {
let b = b as f64;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
@@ -101,7 +97,7 @@ pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2si))]
pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 {
- transmute(vcvtsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.
@@ -111,7 +107,7 @@ pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2usi))]
pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 {
- transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
@@ -121,7 +117,7 @@ pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2si))]
pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 {
- transmute(vcvtss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.
@@ -131,7 +127,7 @@ pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2usi))]
pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 {
- transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
@@ -270,8 +266,7 @@ pub unsafe fn _mm_cvt_roundu64_ss<const ROUNDING: i32>(a: __m128, b: u64) -> __m
pub unsafe fn _mm_cvt_roundsd_si64<const ROUNDING: i32>(a: __m128d) -> i64 {
static_assert_rounding!(ROUNDING);
let a = a.as_f64x2();
- let r = vcvtsd2si64(a, ROUNDING);
- transmute(r)
+ vcvtsd2si64(a, ROUNDING)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
@@ -290,8 +285,7 @@ pub unsafe fn _mm_cvt_roundsd_si64<const ROUNDING: i32>(a: __m128d) -> i64 {
pub unsafe fn _mm_cvt_roundsd_i64<const ROUNDING: i32>(a: __m128d) -> i64 {
static_assert_rounding!(ROUNDING);
let a = a.as_f64x2();
- let r = vcvtsd2si64(a, ROUNDING);
- transmute(r)
+ vcvtsd2si64(a, ROUNDING)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\
@@ -310,8 +304,7 @@ pub unsafe fn _mm_cvt_roundsd_i64<const ROUNDING: i32>(a: __m128d) -> i64 {
pub unsafe fn _mm_cvt_roundsd_u64<const ROUNDING: i32>(a: __m128d) -> u64 {
static_assert_rounding!(ROUNDING);
let a = a.as_f64x2();
- let r = vcvtsd2usi64(a, ROUNDING);
- transmute(r)
+ vcvtsd2usi64(a, ROUNDING)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
@@ -330,8 +323,7 @@ pub unsafe fn _mm_cvt_roundsd_u64<const ROUNDING: i32>(a: __m128d) -> u64 {
pub unsafe fn _mm_cvt_roundss_si64<const ROUNDING: i32>(a: __m128) -> i64 {
static_assert_rounding!(ROUNDING);
let a = a.as_f32x4();
- let r = vcvtss2si64(a, ROUNDING);
- transmute(r)
+ vcvtss2si64(a, ROUNDING)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
@@ -350,8 +342,7 @@ pub unsafe fn _mm_cvt_roundss_si64<const ROUNDING: i32>(a: __m128) -> i64 {
pub unsafe fn _mm_cvt_roundss_i64<const ROUNDING: i32>(a: __m128) -> i64 {
static_assert_rounding!(ROUNDING);
let a = a.as_f32x4();
- let r = vcvtss2si64(a, ROUNDING);
- transmute(r)
+ vcvtss2si64(a, ROUNDING)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\
@@ -370,8 +361,7 @@ pub unsafe fn _mm_cvt_roundss_i64<const ROUNDING: i32>(a: __m128) -> i64 {
pub unsafe fn _mm_cvt_roundss_u64<const ROUNDING: i32>(a: __m128) -> u64 {
static_assert_rounding!(ROUNDING);
let a = a.as_f32x4();
- let r = vcvtss2usi64(a, ROUNDING);
- transmute(r)
+ vcvtss2usi64(a, ROUNDING)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
@@ -385,8 +375,7 @@ pub unsafe fn _mm_cvt_roundss_u64<const ROUNDING: i32>(a: __m128) -> u64 {
pub unsafe fn _mm_cvtt_roundsd_si64<const SAE: i32>(a: __m128d) -> i64 {
static_assert_sae!(SAE);
let a = a.as_f64x2();
- let r = vcvtsd2si64(a, SAE);
- transmute(r)
+ vcvtsd2si64(a, SAE)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
@@ -400,8 +389,7 @@ pub unsafe fn _mm_cvtt_roundsd_si64<const SAE: i32>(a: __m128d) -> i64 {
pub unsafe fn _mm_cvtt_roundsd_i64<const SAE: i32>(a: __m128d) -> i64 {
static_assert_sae!(SAE);
let a = a.as_f64x2();
- let r = vcvtsd2si64(a, SAE);
- transmute(r)
+ vcvtsd2si64(a, SAE)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\
@@ -415,8 +403,7 @@ pub unsafe fn _mm_cvtt_roundsd_i64<const SAE: i32>(a: __m128d) -> i64 {
pub unsafe fn _mm_cvtt_roundsd_u64<const SAE: i32>(a: __m128d) -> u64 {
static_assert_sae!(SAE);
let a = a.as_f64x2();
- let r = vcvtsd2usi64(a, SAE);
- transmute(r)
+ vcvtsd2usi64(a, SAE)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
@@ -430,8 +417,7 @@ pub unsafe fn _mm_cvtt_roundsd_u64<const SAE: i32>(a: __m128d) -> u64 {
pub unsafe fn _mm_cvtt_roundss_i64<const SAE: i32>(a: __m128) -> i64 {
static_assert_sae!(SAE);
let a = a.as_f32x4();
- let r = vcvtss2si64(a, SAE);
- transmute(r)
+ vcvtss2si64(a, SAE)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
@@ -445,8 +431,7 @@ pub unsafe fn _mm_cvtt_roundss_i64<const SAE: i32>(a: __m128) -> i64 {
pub unsafe fn _mm_cvtt_roundss_si64<const SAE: i32>(a: __m128) -> i64 {
static_assert_sae!(SAE);
let a = a.as_f32x4();
- let r = vcvtss2si64(a, SAE);
- transmute(r)
+ vcvtss2si64(a, SAE)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\
@@ -460,8 +445,7 @@ pub unsafe fn _mm_cvtt_roundss_si64<const SAE: i32>(a: __m128) -> i64 {
pub unsafe fn _mm_cvtt_roundss_u64<const SAE: i32>(a: __m128) -> u64 {
static_assert_sae!(SAE);
let a = a.as_f32x4();
- let r = vcvtss2usi64(a, SAE);
- transmute(r)
+ vcvtss2usi64(a, SAE)
}
#[allow(improper_ctypes)]
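
Every change in this file follows one pattern: the inner call (`vcvtss2usi64`, `simd_insert`, and so on) already returns the declared result type, so the surrounding `transmute` was an identity conversion and is dropped. A minimal standalone sketch of the same cleanup, with hypothetical helpers rather than the stdarch intrinsics:

    fn widen(x: f32) -> u64 {
        x as u64
    }

    fn convert(x: f32) -> u64 {
        // before: unsafe { std::mem::transmute::<u64, u64>(widen(x)) }
        // after: the value already has the right type, so return it directly
        widen(x)
    }

    fn main() {
        assert_eq!(convert(3.75), 3); // `as` truncates toward zero
    }
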
diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse2.rs b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs
index bf2394eba..9619cb748 100644
--- a/library/stdarch/crates/core_arch/src/x86_64/sse2.rs
+++ b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs
@@ -181,6 +181,9 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_si64() {
let a: i64 = 7;
let mut mem = boxed::Box::<i64>::new(-1);
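
`_mm_stream_si64` performs a non-temporal (cache-bypassing) store, which is why the test is now skipped under Miri: such weakly ordered stores have no agreed-upon place in the Rust memory model yet. A minimal usage sketch, assuming an x86_64 target; the fence is what makes the stored value reliably visible to subsequent reads:

    #[cfg(target_arch = "x86_64")]
    fn stream_store(dst: &mut i64, value: i64) {
        use std::arch::x86_64::{_mm_sfence, _mm_stream_si64};
        unsafe {
            // Non-temporal store: writes around the cache hierarchy.
            _mm_stream_si64(dst as *mut i64, value);
            // Order the store before anything that follows.
            _mm_sfence();
        }
    }
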
diff --git a/library/stdarch/crates/intrinsic-test/Cargo.toml b/library/stdarch/crates/intrinsic-test/Cargo.toml
index d977dd659..c7a18f77f 100644
--- a/library/stdarch/crates/intrinsic-test/Cargo.toml
+++ b/library/stdarch/crates/intrinsic-test/Cargo.toml
@@ -12,10 +12,10 @@ lazy_static = "1.4.0"
serde = { version = "1", features = ["derive"] }
serde_json = "1.0"
csv = "1.1"
-clap = "2.33.3"
+clap = { version = "4.4", features = ["derive"] }
regex = "1.4.2"
log = "0.4.11"
-pretty_env_logger = "0.4.0"
+pretty_env_logger = "0.5.0"
rayon = "1.5.0"
diff = "0.1.12"
-itertools = "0.10.1"
+itertools = "0.11.0"
diff --git a/library/stdarch/crates/intrinsic-test/README.md b/library/stdarch/crates/intrinsic-test/README.md
index 8a8ddab40..2b3f0c75a 100644
--- a/library/stdarch/crates/intrinsic-test/README.md
+++ b/library/stdarch/crates/intrinsic-test/README.md
@@ -4,15 +4,17 @@ each produces the same result from random inputs.
# Usage
```
USAGE:
- intrinsic-test [OPTIONS] <INPUT>
+ intrinsic-test [FLAGS] [OPTIONS] <INPUT>
FLAGS:
+ --a32 Run tests for A32 intrinsics instead of A64
-h, --help Prints help information
-V, --version Prints version information
OPTIONS:
--cppcompiler <CPPCOMPILER> The C++ compiler to use for compiling the c++ code [default: clang++]
--runner <RUNNER> Run the C programs under emulation with this command
+ --skip <SKIP> Filename for a list of intrinsics to skip (one per line)
--toolchain <TOOLCHAIN> The rust toolchain to use for building the rust code
ARGS:
diff --git a/library/stdarch/crates/intrinsic-test/src/json_parser.rs b/library/stdarch/crates/intrinsic-test/src/json_parser.rs
index bc6fa4a9e..8b3c7869c 100644
--- a/library/stdarch/crates/intrinsic-test/src/json_parser.rs
+++ b/library/stdarch/crates/intrinsic-test/src/json_parser.rs
@@ -1,4 +1,5 @@
use std::collections::HashMap;
+use std::path::Path;
use serde::Deserialize;
@@ -41,7 +42,7 @@ struct JsonIntrinsic {
architectures: Vec<String>,
}
-pub fn get_neon_intrinsics(filename: &str) -> Result<Vec<Intrinsic>, Box<dyn std::error::Error>> {
+pub fn get_neon_intrinsics(filename: &Path) -> Result<Vec<Intrinsic>, Box<dyn std::error::Error>> {
let file = std::fs::File::open(filename)?;
let reader = std::io::BufReader::new(file);
let json: Vec<JsonIntrinsic> = serde_json::from_reader(reader).expect("Couldn't parse JSON");
diff --git a/library/stdarch/crates/intrinsic-test/src/main.rs b/library/stdarch/crates/intrinsic-test/src/main.rs
index 76d2da3ab..15bc021c7 100644
--- a/library/stdarch/crates/intrinsic-test/src/main.rs
+++ b/library/stdarch/crates/intrinsic-test/src/main.rs
@@ -4,9 +4,9 @@ extern crate log;
use std::fs::File;
use std::io::Write;
+use std::path::PathBuf;
use std::process::Command;
-use clap::{App, Arg};
use intrinsic::Intrinsic;
use itertools::Itertools;
use rayon::prelude::*;
@@ -320,58 +320,47 @@ path = "{intrinsic}/main.rs""#,
}
}
+/// Intrinsic test tool
+#[derive(clap::Parser)]
+#[command(
+ name = "Intrinsic test tool",
+ about = "Generates Rust and C programs for intrinsics and compares the output"
+)]
+struct Cli {
+ /// The input file containing the intrinsics
+ input: PathBuf,
+
+ /// The rust toolchain to use for building the rust code
+ #[arg(long)]
+ toolchain: Option<String>,
+
+ /// The C++ compiler to use for compiling the c++ code
+ #[arg(long, default_value_t = String::from("clang++"))]
+ cppcompiler: String,
+
+ /// Run the C programs under emulation with this command
+ #[arg(long)]
+ runner: Option<String>,
+
+ /// Filename for a list of intrinsics to skip (one per line)
+ #[arg(long)]
+ skip: Option<PathBuf>,
+
+ /// Run tests for A32 intrinsics instead of A64
+ #[arg(long)]
+ a32: bool,
+}
+
fn main() {
pretty_env_logger::init();
- let matches = App::new("Intrinsic test tool")
- .about("Generates Rust and C programs for intrinsics and compares the output")
- .arg(
- Arg::with_name("INPUT")
- .help("The input file containing the intrinsics")
- .required(true)
- .index(1),
- )
- .arg(
- Arg::with_name("TOOLCHAIN")
- .takes_value(true)
- .long("toolchain")
- .help("The rust toolchain to use for building the rust code"),
- )
- .arg(
- Arg::with_name("CPPCOMPILER")
- .takes_value(true)
- .default_value("clang++")
- .long("cppcompiler")
- .help("The C++ compiler to use for compiling the c++ code"),
- )
- .arg(
- Arg::with_name("RUNNER")
- .takes_value(true)
- .long("runner")
- .help("Run the C programs under emulation with this command"),
- )
- .arg(
- Arg::with_name("SKIP")
- .takes_value(true)
- .long("skip")
- .help("Filename for a list of intrinsics to skip (one per line)"),
- )
- .arg(
- Arg::with_name("A32")
- .takes_value(false)
- .long("a32")
- .help("Run tests for A32 instrinsics instead of A64"),
- )
- .get_matches();
-
- let filename = matches.value_of("INPUT").unwrap();
- let toolchain = matches
- .value_of("TOOLCHAIN")
- .map_or("".into(), |t| format!("+{t}"));
+ let args: Cli = clap::Parser::parse();
- let cpp_compiler = matches.value_of("CPPCOMPILER").unwrap();
- let c_runner = matches.value_of("RUNNER").unwrap_or("");
- let skip = if let Some(filename) = matches.value_of("SKIP") {
+ let filename = args.input;
+ let toolchain = args.toolchain.map_or_else(String::new, |t| format!("+{t}"));
+ let cpp_compiler = args.cppcompiler;
+ let c_runner = args.runner.unwrap_or_else(String::new);
+ let skip = if let Some(filename) = args.skip {
let data = std::fs::read_to_string(&filename).expect("Failed to open file");
data.lines()
.map(str::trim)
@@ -381,8 +370,8 @@ fn main() {
} else {
Default::default()
};
- let a32 = matches.is_present("A32");
- let mut intrinsics = get_neon_intrinsics(filename).expect("Error parsing input file");
+ let a32 = args.a32;
+ let mut intrinsics = get_neon_intrinsics(&filename).expect("Error parsing input file");
intrinsics.sort_by(|a, b| a.name.cmp(&b.name));
@@ -409,7 +398,7 @@ fn main() {
let notices = build_notices("// ");
- if !build_c(&notices, &intrinsics, cpp_compiler, a32) {
+ if !build_c(&notices, &intrinsics, &cpp_compiler, a32) {
std::process::exit(2);
}
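
The migration above replaces clap 2's builder API (`App`/`Arg`) with clap 4's derive API: the `Cli` struct declares the interface and `clap::Parser::parse()` fills it in. A self-contained sketch of the same pattern, assuming a hypothetical binary with clap 4 and its `derive` feature enabled:

    use clap::Parser;

    /// Toy command-line tool (the doc comment becomes the --help description).
    #[derive(Parser)]
    struct Cli {
        /// Positional input file
        input: std::path::PathBuf,

        /// Optional boolean flag
        #[arg(long)]
        a32: bool,
    }

    fn main() {
        let cli = Cli::parse();
        println!("input = {}, a32 = {}", cli.input.display(), cli.a32);
    }
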
diff --git a/library/stdarch/crates/simd-test-macro/Cargo.toml b/library/stdarch/crates/simd-test-macro/Cargo.toml
index cd110c1d3..c9e692d8e 100644
--- a/library/stdarch/crates/simd-test-macro/Cargo.toml
+++ b/library/stdarch/crates/simd-test-macro/Cargo.toml
@@ -11,3 +11,4 @@ test = false
[dependencies]
proc-macro2 = "1.0"
quote = "1.0"
+syn = { version = "2.0", features = ["full"] }
diff --git a/library/stdarch/crates/simd-test-macro/src/lib.rs b/library/stdarch/crates/simd-test-macro/src/lib.rs
index 2a31dd745..9e089f86b 100644
--- a/library/stdarch/crates/simd-test-macro/src/lib.rs
+++ b/library/stdarch/crates/simd-test-macro/src/lib.rs
@@ -7,7 +7,7 @@
#[macro_use]
extern crate quote;
-use proc_macro2::{Delimiter, Ident, Literal, Span, TokenStream, TokenTree};
+use proc_macro2::{Ident, Literal, Span, TokenStream, TokenTree};
use quote::ToTokens;
use std::env;
@@ -44,13 +44,9 @@ pub fn simd_test(
.collect();
let enable_feature = string(enable_feature);
- let item = TokenStream::from(item);
- let name = find_name(item.clone());
-
- let name: TokenStream = name
- .to_string()
- .parse()
- .unwrap_or_else(|_| panic!("failed to parse name: {}", name.to_string()));
+ let mut item = syn::parse_macro_input!(item as syn::ItemFn);
+ let item_attrs = std::mem::take(&mut item.attrs);
+ let name = &item.sig.ident;
let target = env::var("TARGET").expect(
"TARGET environment variable should be set for rustc (e.g. TARGET=x86_64-apple-darwin cargo test)"
@@ -109,6 +105,7 @@ pub fn simd_test(
#[allow(non_snake_case)]
#[test]
#maybe_ignore
+ #(#item_attrs)*
fn #name() {
if #force_test | (#cfg_target_features) {
let v = unsafe { #name() };
@@ -123,29 +120,3 @@ pub fn simd_test(
};
ret.into()
}
-
-fn find_name(item: TokenStream) -> Ident {
- let mut tokens = item.into_iter();
- while let Some(tok) = tokens.next() {
- if let TokenTree::Ident(word) = tok {
- if word == "fn" {
- break;
- }
- }
- }
-
- fn get_ident(tt: TokenTree) -> Option<Ident> {
- match tt {
- TokenTree::Ident(i) => Some(i),
- TokenTree::Group(g) if g.delimiter() == Delimiter::None => {
- get_ident(g.stream().into_iter().next()?)
- }
- _ => None,
- }
- }
-
- tokens
- .next()
- .and_then(get_ident)
- .expect("failed to find function name")
-}
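
Instead of scanning raw tokens for the identifier after `fn`, the macro now parses the whole item with `syn` and reads the name and attributes off the typed AST. A standalone sketch of that lookup, assuming syn 2 with the "full" feature (a hypothetical snippet, not the macro itself):

    use syn::ItemFn;

    fn main() {
        let src = "#[inline] fn demo() {}";
        // Parse the function into a typed AST node.
        let item: ItemFn = syn::parse_str(src).expect("valid function item");
        // The identifier and attributes are plain fields; no token scanning needed.
        assert_eq!(item.sig.ident.to_string(), "demo");
        assert_eq!(item.attrs.len(), 1);
    }
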
diff --git a/library/stdarch/crates/std_detect/Cargo.toml b/library/stdarch/crates/std_detect/Cargo.toml
index 589a3900a..12d4a658c 100644
--- a/library/stdarch/crates/std_detect/Cargo.toml
+++ b/library/stdarch/crates/std_detect/Cargo.toml
@@ -30,7 +30,6 @@ compiler_builtins = { version = "0.1.2", optional = true }
alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" }
[dev-dependencies]
-auxv = "0.3.3"
cupid = "0.6.0"
[features]
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
index 8bc0b30c3..ee46aa1ac 100644
--- a/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
@@ -19,6 +19,7 @@ pub(crate) const AT_HWCAP2: usize = 26;
/// If an entry cannot be read all the bits in the bitfield are set to zero.
/// This should be interpreted as all the features being disabled.
#[derive(Debug, Copy, Clone)]
+#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct AuxVec {
pub hwcap: usize,
#[cfg(any(
@@ -174,9 +175,12 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> {
/// Tries to read the `key` from the auxiliary vector by calling the
/// dynamically-linked `getauxval` function. If the function is not linked,
/// this function return `Err`.
-#[cfg(all(
- feature = "std_detect_dlsym_getauxval",
- not(all(target_os = "linux", target_env = "gnu"))
+#[cfg(any(
+ test,
+ all(
+ feature = "std_detect_dlsym_getauxval",
+ not(all(target_os = "linux", target_env = "gnu"))
+ )
))]
fn getauxval(key: usize) -> Result<usize, ()> {
use libc;
@@ -262,35 +266,8 @@ fn auxv_from_buf(buf: &[usize]) -> Result<AuxVec, ()> {
#[cfg(test)]
mod tests {
- extern crate auxv as auxv_crate;
use super::*;
- // Reads the Auxiliary Vector key from /proc/self/auxv
- // using the auxv crate.
- #[cfg(feature = "std_detect_file_io")]
- fn auxv_crate_getprocfs(key: usize) -> Option<usize> {
- use self::auxv_crate::procfs::search_procfs_auxv;
- use self::auxv_crate::AuxvType;
- let k = key as AuxvType;
- match search_procfs_auxv(&[k]) {
- Ok(v) => Some(v[&k] as usize),
- Err(_) => None,
- }
- }
-
- // Reads the Auxiliary Vector key from getauxval()
- // using the auxv crate.
- #[cfg(not(any(target_arch = "mips", target_arch = "mips64")))]
- fn auxv_crate_getauxval(key: usize) -> Option<usize> {
- use self::auxv_crate::getauxval::Getauxval;
- use self::auxv_crate::AuxvType;
- let q = auxv_crate::getauxval::NativeGetauxval {};
- match q.getauxval(key as AuxvType) {
- Ok(v) => Some(v as usize),
- Err(_) => None,
- }
- }
-
// FIXME: on mips/mips64 getauxval returns 0, and /proc/self/auxv
// does not always contain the AT_HWCAP key under qemu.
#[cfg(any(
@@ -301,7 +278,7 @@ mod tests {
#[test]
fn auxv_crate() {
let v = auxv();
- if let Some(hwcap) = auxv_crate_getauxval(AT_HWCAP) {
+ if let Ok(hwcap) = getauxval(AT_HWCAP) {
let rt_hwcap = v.expect("failed to find hwcap key").hwcap;
assert_eq!(rt_hwcap, hwcap);
}
@@ -314,7 +291,7 @@ mod tests {
target_arch = "powerpc64"
))]
{
- if let Some(hwcap2) = auxv_crate_getauxval(AT_HWCAP2) {
+ if let Ok(hwcap2) = getauxval(AT_HWCAP2) {
let rt_hwcap2 = v.expect("failed to find hwcap2 key").hwcap2;
assert_eq!(rt_hwcap2, hwcap2);
}
@@ -391,22 +368,8 @@ mod tests {
#[test]
#[cfg(feature = "std_detect_file_io")]
fn auxv_crate_procfs() {
- let v = auxv();
- if let Some(hwcap) = auxv_crate_getprocfs(AT_HWCAP) {
- assert_eq!(v.unwrap().hwcap, hwcap);
- }
-
- // Targets with AT_HWCAP and AT_HWCAP2:
- #[cfg(any(
- target_arch = "aarch64",
- target_arch = "arm",
- target_arch = "powerpc",
- target_arch = "powerpc64"
- ))]
- {
- if let Some(hwcap2) = auxv_crate_getprocfs(AT_HWCAP2) {
- assert_eq!(v.unwrap().hwcap2, hwcap2);
- }
+ if let Ok(procfs_auxv) = auxv_from_file("/proc/self/auxv") {
+ assert_eq!(auxv().unwrap(), procfs_auxv);
}
}
}
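
With the `auxv` dev-dependency gone, the tests compare `auxv()` against the C library's `getauxval` and against `/proc/self/auxv` directly. A minimal sketch of the libc route, assuming a Linux target with glibc or musl and the `libc` crate available:

    // Read one auxiliary-vector key straight from the C library.
    #[cfg(target_os = "linux")]
    fn hwcap() -> libc::c_ulong {
        // getauxval returns 0 when the key is absent.
        unsafe { libc::getauxval(libc::AT_HWCAP) }
    }
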
diff --git a/library/stdarch/crates/std_detect/src/detect/os/x86.rs b/library/stdarch/crates/std_detect/src/detect/os/x86.rs
index d8afc1aca..d8dd84db4 100644
--- a/library/stdarch/crates/std_detect/src/detect/os/x86.rs
+++ b/library/stdarch/crates/std_detect/src/detect/os/x86.rs
@@ -49,11 +49,7 @@ pub(crate) fn detect_features() -> cache::Initializer {
ecx,
edx,
} = __cpuid(0);
- let vendor_id: [[u8; 4]; 3] = [
- mem::transmute(ebx),
- mem::transmute(edx),
- mem::transmute(ecx),
- ];
+ let vendor_id: [[u8; 4]; 3] = [ebx.to_ne_bytes(), edx.to_ne_bytes(), ecx.to_ne_bytes()];
let vendor_id: [u8; 12] = mem::transmute(vendor_id);
(max_basic_leaf, vendor_id)
};
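
The `to_ne_bytes` form avoids `mem::transmute` on the three CPUID registers while producing the same bytes. A standalone sketch of the assembly step, assuming a little-endian host as on x86 (a hypothetical helper, not the detection code itself):

    fn vendor_bytes(ebx: u32, edx: u32, ecx: u32) -> [u8; 12] {
        let mut id = [0u8; 12];
        id[0..4].copy_from_slice(&ebx.to_ne_bytes());
        id[4..8].copy_from_slice(&edx.to_ne_bytes());
        id[8..12].copy_from_slice(&ecx.to_ne_bytes());
        id
    }

    fn main() {
        // Register values a GenuineIntel CPU reports for CPUID leaf 0.
        let id = vendor_bytes(0x756e_6547, 0x4965_6e69, 0x6c65_746e);
        assert_eq!(&id, b"GenuineIntel");
    }
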
diff --git a/library/stdarch/crates/stdarch-test/Cargo.toml b/library/stdarch/crates/stdarch-test/Cargo.toml
index 3a2130d4e..3682fcd7e 100644
--- a/library/stdarch/crates/stdarch-test/Cargo.toml
+++ b/library/stdarch/crates/stdarch-test/Cargo.toml
@@ -20,7 +20,7 @@ cc = "1.0"
# time, and we want to make updates to this explicit rather than automatically
# picking up updates which might break CI with new instruction names.
[target.'cfg(target_arch = "wasm32")'.dependencies]
-wasmprinter = "=0.2.53"
+wasmprinter = "=0.2.67"
[features]
default = []
diff --git a/library/stdarch/crates/stdarch-test/src/disassembly.rs b/library/stdarch/crates/stdarch-test/src/disassembly.rs
index 54df7261e..087fc46d4 100644
--- a/library/stdarch/crates/stdarch-test/src/disassembly.rs
+++ b/library/stdarch/crates/stdarch-test/src/disassembly.rs
@@ -81,6 +81,8 @@ pub(crate) fn disassemble_myself() -> HashSet<Function> {
let add_args = if cfg!(target_os = "macos") && cfg!(target_arch = "aarch64") {
// Target features need to be enabled for LLVM objdump on Macos ARM64
vec!["--mattr=+v8.6a,+crypto,+tme"]
+ } else if cfg!(target_arch = "riscv64") {
+ vec!["--mattr=+zk,+zks,+zbc,+zbb"]
} else {
vec![]
};
diff --git a/library/stdarch/crates/stdarch-verify/Cargo.toml b/library/stdarch/crates/stdarch-verify/Cargo.toml
index 10ae90074..515f05138 100644
--- a/library/stdarch/crates/stdarch-verify/Cargo.toml
+++ b/library/stdarch/crates/stdarch-verify/Cargo.toml
@@ -7,7 +7,7 @@ edition = "2021"
[dependencies]
proc-macro2 = "1.0"
quote = "1.0"
-syn = { version = "1.0", features = ["full"] }
+syn = { version = "2.0", features = ["full"] }
[lib]
proc-macro = true
@@ -15,5 +15,5 @@ test = false
[dev-dependencies]
serde = { version = "1.0", features = ['derive'] }
-serde-xml-rs = "0.3"
+serde-xml-rs = "0.6"
serde_json = "1.0.96"
diff --git a/library/stdarch/crates/stdarch-verify/src/lib.rs b/library/stdarch/crates/stdarch-verify/src/lib.rs
index a9bf89f70..3f9eb3bf9 100644
--- a/library/stdarch/crates/stdarch-verify/src/lib.rs
+++ b/library/stdarch/crates/stdarch-verify/src/lib.rs
@@ -7,6 +7,7 @@ extern crate syn;
use proc_macro::TokenStream;
use std::{fs::File, io::Read, path::Path};
use syn::ext::IdentExt;
+use syn::parse::Parser as _;
#[proc_macro]
pub fn x86_functions(input: TokenStream) -> TokenStream {
@@ -416,7 +417,7 @@ fn walk(root: &Path, files: &mut Vec<(syn::File, String)>) {
fn find_instrs(attrs: &[syn::Attribute]) -> Vec<String> {
struct AssertInstr {
- instr: String,
+ instr: Option<String>,
}
// A small custom parser to parse out the instruction in `assert_instr`.
@@ -424,15 +425,21 @@ fn find_instrs(attrs: &[syn::Attribute]) -> Vec<String> {
// TODO: should probably just reuse `Invoc` from the `assert-instr-macro`
// crate.
impl syn::parse::Parse for AssertInstr {
- fn parse(content: syn::parse::ParseStream<'_>) -> syn::Result<Self> {
- let input;
- parenthesized!(input in content);
- let _ = input.parse::<syn::Meta>()?;
- let _ = input.parse::<Token![,]>()?;
- let ident = input.parse::<syn::Ident>()?;
- if ident != "assert_instr" {
- return Err(input.error("expected `assert_instr`"));
+ fn parse(input: syn::parse::ParseStream<'_>) -> syn::Result<Self> {
+ let _ = input.parse::<syn::Meta>().unwrap();
+ let _ = input.parse::<Token![,]>().unwrap();
+
+ match input.parse::<syn::Ident>() {
+ Ok(ident) if ident == "assert_instr" => {}
+ _ => {
+ while !input.is_empty() {
+ // consume everything
+ drop(input.parse::<proc_macro2::TokenStream>());
+ }
+ return Ok(Self { instr: None });
+ }
}
+
let instrs;
parenthesized!(instrs in input);
@@ -452,18 +459,24 @@ fn find_instrs(attrs: &[syn::Attribute]) -> Vec<String> {
return Err(input.error("failed to parse instruction"));
}
}
- Ok(Self { instr })
+ Ok(Self { instr: Some(instr) })
}
}
attrs
.iter()
- .filter(|a| a.path.is_ident("cfg_attr"))
.filter_map(|a| {
- syn::parse2::<AssertInstr>(a.tokens.clone())
- .ok()
- .map(|a| a.instr)
+ if let syn::Meta::List(ref l) = a.meta {
+ if l.path.is_ident("cfg_attr") {
+ Some(l)
+ } else {
+ None
+ }
+ } else {
+ None
+ }
})
+ .filter_map(|l| syn::parse2::<AssertInstr>(l.tokens.clone()).unwrap().instr)
.collect()
}
@@ -471,19 +484,26 @@ fn find_target_feature(attrs: &[syn::Attribute]) -> Option<syn::Lit> {
attrs
.iter()
.flat_map(|a| {
- if let Ok(syn::Meta::List(i)) = a.parse_meta() {
- if i.path.is_ident("target_feature") {
- return i.nested;
+ if let syn::Meta::List(ref l) = a.meta {
+ if l.path.is_ident("target_feature") {
+ if let Ok(l) =
+ syn::punctuated::Punctuated::<syn::Meta, Token![,]>::parse_terminated
+ .parse2(l.tokens.clone())
+ {
+ return l;
+ }
}
}
syn::punctuated::Punctuated::new()
})
- .filter_map(|nested| match nested {
- syn::NestedMeta::Meta(m) => Some(m),
- syn::NestedMeta::Lit(_) => None,
- })
.find_map(|m| match m {
- syn::Meta::NameValue(ref i) if i.path.is_ident("enable") => Some(i.clone().lit),
+ syn::Meta::NameValue(i) if i.path.is_ident("enable") => {
+ if let syn::Expr::Lit(lit) = i.value {
+ Some(lit.lit)
+ } else {
+ None
+ }
+ }
_ => None,
})
}
@@ -491,9 +511,16 @@ fn find_target_feature(attrs: &[syn::Attribute]) -> Option<syn::Lit> {
fn find_required_const(name: &str, attrs: &[syn::Attribute]) -> Vec<usize> {
attrs
.iter()
- .flat_map(|a| {
- if a.path.segments[0].ident == name {
- syn::parse::<RustcArgsRequiredConst>(a.tokens.clone().into())
+ .filter_map(|a| {
+ if let syn::Meta::List(ref l) = a.meta {
+ Some(l)
+ } else {
+ None
+ }
+ })
+ .flat_map(|l| {
+ if l.path.segments[0].ident == name {
+ syn::parse2::<RustcArgsRequiredConst>(l.tokens.clone())
.unwrap()
.args
} else {
@@ -509,10 +536,7 @@ struct RustcArgsRequiredConst {
impl syn::parse::Parse for RustcArgsRequiredConst {
fn parse(input: syn::parse::ParseStream<'_>) -> syn::Result<Self> {
- let content;
- parenthesized!(content in input);
- let list =
- syn::punctuated::Punctuated::<syn::LitInt, Token![,]>::parse_terminated(&content)?;
+ let list = syn::punctuated::Punctuated::<syn::LitInt, Token![,]>::parse_terminated(&input)?;
Ok(Self {
args: list
.into_iter()
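
In syn 2 the parsed attribute body is already available as `Attribute::meta`, so the verifier matches on `Meta::List` and re-parses only the inner tokens instead of calling the removed `parse_meta()`. A standalone sketch of extracting the `enable` string from `#[target_feature(enable = "...")]`, assuming syn 2 with the "full" feature (a hypothetical helper mirroring the logic above):

    use syn::parse::Parser as _;

    fn enable_value(attr: &syn::Attribute) -> Option<syn::Lit> {
        // Only list-style attributes such as #[target_feature(...)] are of interest.
        let syn::Meta::List(list) = &attr.meta else { return None };
        if !list.path.is_ident("target_feature") {
            return None;
        }
        // Re-parse the inner tokens as a comma-separated list of metas.
        let nested = syn::punctuated::Punctuated::<syn::Meta, syn::Token![,]>::parse_terminated
            .parse2(list.tokens.clone())
            .ok()?;
        nested.into_iter().find_map(|meta| match meta {
            syn::Meta::NameValue(nv) if nv.path.is_ident("enable") => match nv.value {
                syn::Expr::Lit(lit) => Some(lit.lit),
                _ => None,
            },
            _ => None,
        })
    }
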
diff --git a/library/stdarch/examples/Cargo.toml b/library/stdarch/examples/Cargo.toml
index 38f497fa6..d9034dd80 100644
--- a/library/stdarch/examples/Cargo.toml
+++ b/library/stdarch/examples/Cargo.toml
@@ -13,8 +13,8 @@ default-run = "hex"
[dependencies]
core_arch = { path = "../crates/core_arch" }
std_detect = { path = "../crates/std_detect" }
-quickcheck = "0.9"
-rand = "0.7"
+quickcheck = "1.0"
+rand = "0.8"
[[bin]]
name = "hex"