From ef24de24a82fe681581cc130f342363c47c0969a Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 7 Jun 2024 07:48:48 +0200 Subject: Merging upstream version 1.75.0+dfsg1. Signed-off-by: Daniel Baumann --- .../stdarch/crates/assert-instr-macro/Cargo.toml | 2 +- .../stdarch/crates/assert-instr-macro/src/lib.rs | 13 +- .../core_arch/src/arm_shared/barrier/cp15.rs | 9 +- .../src/arm_shared/neon/shift_and_insert_tests.rs | 4 +- library/stdarch/crates/core_arch/src/riscv64/zk.rs | 57 ++-- .../crates/core_arch/src/riscv_shared/zb.rs | 12 +- .../crates/core_arch/src/riscv_shared/zk.rs | 30 +- library/stdarch/crates/core_arch/src/x86/avx.rs | 40 +-- library/stdarch/crates/core_arch/src/x86/avx2.rs | 46 +-- .../crates/core_arch/src/x86/avx512bitalg.rs | 12 +- .../stdarch/crates/core_arch/src/x86/avx512bw.rs | 104 +++---- .../stdarch/crates/core_arch/src/x86/avx512f.rs | 333 ++++++++------------- library/stdarch/crates/core_arch/src/x86/sse.rs | 186 +++++++++--- library/stdarch/crates/core_arch/src/x86/sse2.rs | 309 ++++++++++++------- library/stdarch/crates/core_arch/src/x86/sse3.rs | 18 +- library/stdarch/crates/core_arch/src/x86/sse41.rs | 72 +++-- library/stdarch/crates/core_arch/src/x86/test.rs | 9 + .../stdarch/crates/core_arch/src/x86_64/avx512f.rs | 60 ++-- .../stdarch/crates/core_arch/src/x86_64/sse2.rs | 3 + library/stdarch/crates/intrinsic-test/Cargo.toml | 6 +- library/stdarch/crates/intrinsic-test/README.md | 4 +- .../crates/intrinsic-test/src/json_parser.rs | 3 +- library/stdarch/crates/intrinsic-test/src/main.rs | 93 +++--- library/stdarch/crates/simd-test-macro/Cargo.toml | 1 + library/stdarch/crates/simd-test-macro/src/lib.rs | 39 +-- library/stdarch/crates/std_detect/Cargo.toml | 1 - .../std_detect/src/detect/os/linux/auxvec.rs | 59 +--- .../stdarch/crates/std_detect/src/detect/os/x86.rs | 6 +- library/stdarch/crates/stdarch-test/Cargo.toml | 2 +- .../stdarch/crates/stdarch-test/src/disassembly.rs | 2 + library/stdarch/crates/stdarch-verify/Cargo.toml | 4 +- library/stdarch/crates/stdarch-verify/src/lib.rs | 82 +++-- 32 files changed, 843 insertions(+), 778 deletions(-) (limited to 'library/stdarch/crates') diff --git a/library/stdarch/crates/assert-instr-macro/Cargo.toml b/library/stdarch/crates/assert-instr-macro/Cargo.toml index 4ad654e69..881c8109c 100644 --- a/library/stdarch/crates/assert-instr-macro/Cargo.toml +++ b/library/stdarch/crates/assert-instr-macro/Cargo.toml @@ -11,4 +11,4 @@ test = false [dependencies] proc-macro2 = "1.0" quote = "1.0" -syn = { version = "1.0", features = ["full"] } +syn = { version = "2.0", features = ["full"] } diff --git a/library/stdarch/crates/assert-instr-macro/src/lib.rs b/library/stdarch/crates/assert-instr-macro/src/lib.rs index 99e37c910..c9de43943 100644 --- a/library/stdarch/crates/assert-instr-macro/src/lib.rs +++ b/library/stdarch/crates/assert-instr-macro/src/lib.rs @@ -35,6 +35,15 @@ pub fn assert_instr( let instr = &invoc.instr; let name = &func.sig.ident; + let maybe_allow_deprecated = if func + .attrs + .iter() + .any(|attr| attr.path().is_ident("deprecated")) + { + quote! { #[allow(deprecated)] } + } else { + quote! {} + }; // Disable assert_instr for x86 targets compiled with avx enabled, which // causes LLVM to generate different intrinsics that the ones we are @@ -108,7 +117,7 @@ pub fn assert_instr( .attrs .iter() .filter(|attr| { - attr.path + attr.path() .segments .first() .expect("attr.path.segments.first() failed") @@ -135,6 +144,7 @@ pub fn assert_instr( let to_test = if disable_dedup_guard { quote! 
{ #attrs + #maybe_allow_deprecated #[no_mangle] #[inline(never)] pub unsafe extern #abi fn #shim_name(#(#inputs),*) #ret { @@ -147,6 +157,7 @@ pub fn assert_instr( const #shim_name_ptr : *const u8 = #shim_name_str.as_ptr(); #attrs + #maybe_allow_deprecated #[no_mangle] #[inline(never)] pub unsafe extern #abi fn #shim_name(#(#inputs),*) #ret { diff --git a/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs b/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs index 6faae0fee..fe540a7d8 100644 --- a/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs +++ b/library/stdarch/crates/core_arch/src/arm_shared/barrier/cp15.rs @@ -11,7 +11,8 @@ impl super::super::sealed::Dmb for SY { #[inline(always)] unsafe fn __dmb(&self) { asm!( - "mcr p15, 0, r0, c7, c10, 5", + "mcr p15, 0, {}, c7, c10, 5", + in(reg) 0_u32, options(preserves_flags, nostack) ) } @@ -21,7 +22,8 @@ impl super::super::sealed::Dsb for SY { #[inline(always)] unsafe fn __dsb(&self) { asm!( - "mcr p15, 0, r0, c7, c10, 4", + "mcr p15, 0, {}, c7, c10, 4", + in(reg) 0_u32, options(preserves_flags, nostack) ) } @@ -31,7 +33,8 @@ impl super::super::sealed::Isb for SY { #[inline(always)] unsafe fn __isb(&self) { asm!( - "mcr p15, 0, r0, c7, c5, 4", + "mcr p15, 0, {}, c7, c5, 4", + in(reg) 0_u32, options(preserves_flags, nostack) ) } diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs index ebb8b7b9e..54bffa450 100644 --- a/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/shift_and_insert_tests.rs @@ -22,7 +22,7 @@ macro_rules! test_vsli { let a = [$($a as $t),*]; let b = [$($b as $t),*]; let n_bit_mask: $t = (1 << $n) - 1; - let e = [$(($a as $t & n_bit_mask) | ($b as $t << $n)),*]; + let e = [$(($a as $t & n_bit_mask) | (($b as $t) << $n)),*]; let r = $fn_id::<$n>(transmute(a), transmute(b)); let mut d = e; d = transmute(r); @@ -60,7 +60,7 @@ macro_rules! test_vsri { unsafe fn $test_id() { let a = [$($a as $t),*]; let b = [$($b as $t),*]; - let n_bit_mask = ((1 as $t << $n) - 1).rotate_right($n); + let n_bit_mask = (((1 as $t) << $n) - 1).rotate_right($n); let e = [$(($a as $t & n_bit_mask) | (($b as $t >> $n) & !n_bit_mask)),*]; let r = $fn_id::<$n>(transmute(a), transmute(b)); let mut d = e; diff --git a/library/stdarch/crates/core_arch/src/riscv64/zk.rs b/library/stdarch/crates/core_arch/src/riscv64/zk.rs index 3dbe3705d..9b403fc95 100644 --- a/library/stdarch/crates/core_arch/src/riscv64/zk.rs +++ b/library/stdarch/crates/core_arch/src/riscv64/zk.rs @@ -20,6 +20,9 @@ extern "unadjusted" { #[link_name = "llvm.riscv.aes64ks2"] fn _aes64ks2(rs1: i64, rs2: i64) -> i64; + #[link_name = "llvm.riscv.aes64im"] + fn _aes64im(rs1: i64) -> i64; + #[link_name = "llvm.riscv.sha512sig0"] fn _sha512sig0(rs1: i64) -> i64; @@ -50,8 +53,7 @@ extern "unadjusted" { /// /// This function is safe to use if the `zkne` target feature is present. #[target_feature(enable = "zkne")] -// See #1464 -// #[cfg_attr(test, assert_instr(aes64es))] +#[cfg_attr(test, assert_instr(aes64es))] #[inline] pub unsafe fn aes64es(rs1: u64, rs2: u64) -> u64 { _aes64es(rs1 as i64, rs2 as i64) as u64 @@ -74,8 +76,7 @@ pub unsafe fn aes64es(rs1: u64, rs2: u64) -> u64 { /// /// This function is safe to use if the `zkne` target feature is present. 
#[target_feature(enable = "zkne")] -// See #1464 -// #[cfg_attr(test, assert_instr(aes64esm))] +#[cfg_attr(test, assert_instr(aes64esm))] #[inline] pub unsafe fn aes64esm(rs1: u64, rs2: u64) -> u64 { _aes64esm(rs1 as i64, rs2 as i64) as u64 @@ -98,8 +99,7 @@ pub unsafe fn aes64esm(rs1: u64, rs2: u64) -> u64 { /// /// This function is safe to use if the `zknd` target feature is present. #[target_feature(enable = "zknd")] -// See #1464 -// #[cfg_attr(test, assert_instr(aes64ds))] +#[cfg_attr(test, assert_instr(aes64ds))] #[inline] pub unsafe fn aes64ds(rs1: u64, rs2: u64) -> u64 { _aes64ds(rs1 as i64, rs2 as i64) as u64 @@ -122,8 +122,7 @@ pub unsafe fn aes64ds(rs1: u64, rs2: u64) -> u64 { /// /// This function is safe to use if the `zknd` target feature is present. #[target_feature(enable = "zknd")] -// See #1464 -// #[cfg_attr(test, assert_instr(aes64dsm))] +#[cfg_attr(test, assert_instr(aes64dsm))] #[inline] pub unsafe fn aes64dsm(rs1: u64, rs2: u64) -> u64 { _aes64dsm(rs1 as i64, rs2 as i64) as u64 @@ -152,8 +151,7 @@ pub unsafe fn aes64dsm(rs1: u64, rs2: u64) -> u64 { /// This function is safe to use if the `zkne` or `zknd` target feature is present. #[target_feature(enable = "zkne", enable = "zknd")] #[rustc_legacy_const_generics(1)] -// See #1464 -// #[cfg_attr(test, assert_instr(aes64ks1i, RNUM = 0))] +#[cfg_attr(test, assert_instr(aes64ks1i, RNUM = 0))] #[inline] pub unsafe fn aes64ks1i(rs1: u64) -> u64 { static_assert!(RNUM <= 10); @@ -177,13 +175,36 @@ pub unsafe fn aes64ks1i(rs1: u64) -> u64 { /// /// This function is safe to use if the `zkne` or `zknd` target feature is present. #[target_feature(enable = "zkne", enable = "zknd")] -// See #1464 -// #[cfg_attr(test, assert_instr(aes64ks2))] +#[cfg_attr(test, assert_instr(aes64ks2))] #[inline] pub unsafe fn aes64ks2(rs1: u64, rs2: u64) -> u64 { _aes64ks2(rs1 as i64, rs2 as i64) as u64 } +/// This instruction accelerates the inverse MixColumns step of the AES Block Cipher, and is used to aid creation of +/// the decryption KeySchedule. +/// +/// The instruction applies the inverse MixColumns transformation to two columns of the state array, packed +/// into a single 64-bit register. It is used to create the inverse cipher KeySchedule, according to the equivalent +/// inverse cipher construction in (Page 23, Section 5.3.5). This instruction must always be implemented +/// such that its execution latency does not depend on the data being operated on. +/// +/// Source: RISC-V Cryptography Extensions Volume I: Scalar & Entropy Source Instructions +/// +/// Version: v1.0.1 +/// +/// Section: 3.9 +/// +/// # Safety +/// +/// This function is safe to use if the `zkne` or `zknd` target feature is present. +#[target_feature(enable = "zkne", enable = "zknd")] +#[cfg_attr(test, assert_instr(aes64im))] +#[inline] +pub unsafe fn aes64im(rs1: u64) -> u64 { + _aes64im(rs1 as i64) as u64 +} + /// Implements the Sigma0 transformation function as used in the SHA2-512 hash function \[49\] /// (Section 4.1.3). /// @@ -201,8 +222,7 @@ pub unsafe fn aes64ks2(rs1: u64, rs2: u64) -> u64 { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha512sig0))] +#[cfg_attr(test, assert_instr(sha512sig0))] #[inline] pub unsafe fn sha512sig0(rs1: u64) -> u64 { _sha512sig0(rs1 as i64) as u64 @@ -225,8 +245,7 @@ pub unsafe fn sha512sig0(rs1: u64) -> u64 { /// /// This function is safe to use if the `zknh` target feature is present. 
#[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha512sig1))] +#[cfg_attr(test, assert_instr(sha512sig1))] #[inline] pub unsafe fn sha512sig1(rs1: u64) -> u64 { _sha512sig1(rs1 as i64) as u64 @@ -249,8 +268,7 @@ pub unsafe fn sha512sig1(rs1: u64) -> u64 { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha512sum0))] +#[cfg_attr(test, assert_instr(sha512sum0))] #[inline] pub unsafe fn sha512sum0(rs1: u64) -> u64 { _sha512sum0(rs1 as i64) as u64 @@ -273,8 +291,7 @@ pub unsafe fn sha512sum0(rs1: u64) -> u64 { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha512sum1))] +#[cfg_attr(test, assert_instr(sha512sum1))] #[inline] pub unsafe fn sha512sum1(rs1: u64) -> u64 { _sha512sum1(rs1 as i64) as u64 diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs b/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs index cfae6caa5..6785c04fd 100644 --- a/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs +++ b/library/stdarch/crates/core_arch/src/riscv_shared/zb.rs @@ -47,8 +47,7 @@ extern "unadjusted" { /// /// This function is safe to use if the `zbb` target feature is present. #[target_feature(enable = "zbb")] -// See #1464 -// #[cfg_attr(test, assert_instr(orc.b))] +#[cfg_attr(test, assert_instr(orc.b))] #[inline] pub unsafe fn orc_b(rs: usize) -> usize { #[cfg(target_arch = "riscv32")] @@ -76,8 +75,7 @@ pub unsafe fn orc_b(rs: usize) -> usize { /// /// This function is safe to use if the `zbc` target feature is present. #[target_feature(enable = "zbc")] -// See #1464 -// #[cfg_attr(test, assert_instr(clmul))] +#[cfg_attr(test, assert_instr(clmul))] #[inline] pub unsafe fn clmul(rs1: usize, rs2: usize) -> usize { #[cfg(target_arch = "riscv32")] @@ -105,8 +103,7 @@ pub unsafe fn clmul(rs1: usize, rs2: usize) -> usize { /// /// This function is safe to use if the `zbc` target feature is present. #[target_feature(enable = "zbc")] -// See #1464 -// #[cfg_attr(test, assert_instr(clmulh))] +#[cfg_attr(test, assert_instr(clmulh))] #[inline] pub unsafe fn clmulh(rs1: usize, rs2: usize) -> usize { #[cfg(target_arch = "riscv32")] @@ -134,8 +131,7 @@ pub unsafe fn clmulh(rs1: usize, rs2: usize) -> usize { /// /// This function is safe to use if the `zbc` target feature is present. #[target_feature(enable = "zbc")] -// See #1464 -// #[cfg_attr(test, assert_instr(clmulr))] +#[cfg_attr(test, assert_instr(clmulr))] #[inline] pub unsafe fn clmulr(rs1: usize, rs2: usize) -> usize { #[cfg(target_arch = "riscv32")] diff --git a/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs b/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs index db97f72bc..5fc5b4cda 100644 --- a/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs +++ b/library/stdarch/crates/core_arch/src/riscv_shared/zk.rs @@ -62,8 +62,7 @@ extern "unadjusted" { /// /// This function is safe to use if the `zbkx` target feature is present. #[target_feature(enable = "zbkx")] -// See #1464 -// #[cfg_attr(test, assert_instr(xperm8))] +#[cfg_attr(test, assert_instr(xperm8))] #[inline] pub unsafe fn xperm8(rs1: usize, rs2: usize) -> usize { #[cfg(target_arch = "riscv32")] @@ -94,8 +93,7 @@ pub unsafe fn xperm8(rs1: usize, rs2: usize) -> usize { /// /// This function is safe to use if the `zbkx` target feature is present. 
#[target_feature(enable = "zbkx")] -// See #1464 -// #[cfg_attr(test, assert_instr(xperm4))] +#[cfg_attr(test, assert_instr(xperm4))] #[inline] pub unsafe fn xperm4(rs1: usize, rs2: usize) -> usize { #[cfg(target_arch = "riscv32")] @@ -129,8 +127,7 @@ pub unsafe fn xperm4(rs1: usize, rs2: usize) -> usize { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha256sig0))] +#[cfg_attr(test, assert_instr(sha256sig0))] #[inline] pub unsafe fn sha256sig0(rs1: u32) -> u32 { _sha256sig0(rs1 as i32) as u32 @@ -156,8 +153,7 @@ pub unsafe fn sha256sig0(rs1: u32) -> u32 { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha256sig1))] +#[cfg_attr(test, assert_instr(sha256sig1))] #[inline] pub unsafe fn sha256sig1(rs1: u32) -> u32 { _sha256sig1(rs1 as i32) as u32 @@ -183,8 +179,7 @@ pub unsafe fn sha256sig1(rs1: u32) -> u32 { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha256sum0))] +#[cfg_attr(test, assert_instr(sha256sum0))] #[inline] pub unsafe fn sha256sum0(rs1: u32) -> u32 { _sha256sum0(rs1 as i32) as u32 @@ -210,8 +205,7 @@ pub unsafe fn sha256sum0(rs1: u32) -> u32 { /// /// This function is safe to use if the `zknh` target feature is present. #[target_feature(enable = "zknh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sha256sum1))] +#[cfg_attr(test, assert_instr(sha256sum1))] #[inline] pub unsafe fn sha256sum1(rs1: u32) -> u32 { _sha256sum1(rs1 as i32) as u32 @@ -288,8 +282,7 @@ pub unsafe fn sha256sum1(rs1: u32) -> u32 { /// ``` #[target_feature(enable = "zksed")] #[rustc_legacy_const_generics(2)] -// See #1464 -// #[cfg_attr(test, assert_instr(sm4ed, BS = 0))] +#[cfg_attr(test, assert_instr(sm4ed, BS = 0))] #[inline] pub unsafe fn sm4ed(rs1: u32, rs2: u32) -> u32 { static_assert!(BS < 4); @@ -368,8 +361,7 @@ pub unsafe fn sm4ed(rs1: u32, rs2: u32) -> u32 { /// ``` #[target_feature(enable = "zksed")] #[rustc_legacy_const_generics(2)] -// See #1464 -// #[cfg_attr(test, assert_instr(sm4ks, BS = 0))] +#[cfg_attr(test, assert_instr(sm4ks, BS = 0))] #[inline] pub unsafe fn sm4ks(rs1: u32, rs2: u32) -> u32 { static_assert!(BS < 4); @@ -409,8 +401,7 @@ pub unsafe fn sm4ks(rs1: u32, rs2: u32) -> u32 { /// compression function `CF` uses the intermediate value `TT2` to calculate /// the variable `E` in one iteration for subsequent processes. 
#[target_feature(enable = "zksh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sm3p0))] +#[cfg_attr(test, assert_instr(sm3p0))] #[inline] pub unsafe fn sm3p0(rs1: u32) -> u32 { _sm3p0(rs1 as i32) as u32 @@ -454,8 +445,7 @@ pub unsafe fn sm3p0(rs1: u32) -> u32 { /// ENDFOR /// ``` #[target_feature(enable = "zksh")] -// See #1464 -// #[cfg_attr(test, assert_instr(sm3p1))] +#[cfg_attr(test, assert_instr(sm3p1))] #[inline] pub unsafe fn sm3p1(rs1: u32) -> u32 { _sm3p1(rs1 as i32) as u32 diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs index 00bcc1fa1..de5dc05b8 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx.rs @@ -268,7 +268,11 @@ pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { #[cfg_attr(test, assert_instr(vaddsubpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { - addsubpd256(a, b) + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [4, 1, 6, 3]) } /// Alternatively adds and subtracts packed single-precision (32-bit) @@ -280,7 +284,11 @@ pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { #[cfg_attr(test, assert_instr(vaddsubps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { - addsubps256(a, b) + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) } /// Subtracts packed double-precision (64-bit) floating-point elements in `b` @@ -511,7 +519,8 @@ pub unsafe fn _mm256_blend_ps(a: __m256, b: __m256) -> __m256 { #[cfg_attr(test, assert_instr(vblendvpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { - vblendvpd(a, b, c) + let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::splat(0)); + transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4())) } /// Blends packed single-precision (32-bit) floating-point elements from @@ -523,7 +532,8 @@ pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { #[cfg_attr(test, assert_instr(vblendvps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { - vblendvps(a, b, c) + let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::splat(0)); + transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8())) } /// Conditionally multiplies the packed single-precision (32-bit) floating-point @@ -2056,7 +2066,10 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { #[cfg_attr(test, assert_instr(vmovmskpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { - movmskpd256(a) + // Propagate the highest bit to the rest, because simd_bitmask + // requires all-1 or all-0. 
+ let mask: i64x4 = simd_lt(transmute(a), i64x4::splat(0)); + simd_bitmask::(mask).into() } /// Sets each bit of the returned mask based on the most significant bit of the @@ -2069,7 +2082,10 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { #[cfg_attr(test, assert_instr(vmovmskps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 { - movmskps256(a) + // Propagate the highest bit to the rest, because simd_bitmask + // requires all-1 or all-0. + let mask: i32x8 = simd_lt(transmute(a), i32x8::splat(0)); + simd_bitmask::(mask).into() } /// Returns vector of type __m256d with all elements set to zero. @@ -2904,20 +2920,12 @@ pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 { // LLVM intrinsics used in the above functions #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.avx.addsub.pd.256"] - fn addsubpd256(a: __m256d, b: __m256d) -> __m256d; - #[link_name = "llvm.x86.avx.addsub.ps.256"] - fn addsubps256(a: __m256, b: __m256) -> __m256; #[link_name = "llvm.x86.avx.round.pd.256"] fn roundpd256(a: __m256d, b: i32) -> __m256d; #[link_name = "llvm.x86.avx.round.ps.256"] fn roundps256(a: __m256, b: i32) -> __m256; #[link_name = "llvm.x86.avx.sqrt.ps.256"] fn sqrtps256(a: __m256) -> __m256; - #[link_name = "llvm.x86.avx.blendv.pd.256"] - fn vblendvpd(a: __m256d, b: __m256d, c: __m256d) -> __m256d; - #[link_name = "llvm.x86.avx.blendv.ps.256"] - fn vblendvps(a: __m256, b: __m256, c: __m256) -> __m256; #[link_name = "llvm.x86.avx.dp.ps.256"] fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256; #[link_name = "llvm.x86.avx.hadd.pd.256"] @@ -3026,10 +3034,6 @@ extern "C" { fn vtestcps(a: __m128, b: __m128) -> i32; #[link_name = "llvm.x86.avx.vtestnzc.ps"] fn vtestnzcps(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.avx.movmsk.pd.256"] - fn movmskpd256(a: __m256d) -> i32; - #[link_name = "llvm.x86.avx.movmsk.ps.256"] - fn movmskps256(a: __m256) -> i32; #[link_name = "llvm.x86.avx.min.ps.256"] fn vminps(a: __m256, b: __m256) -> __m256; #[link_name = "llvm.x86.avx.max.ps.256"] diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs index e23c795ee..243a4cdab 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx2.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs @@ -344,7 +344,10 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpavgw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { - transmute(pavgw(a.as_u16x16(), b.as_u16x16())) + let a = simd_cast::<_, u32x16>(a.as_u16x16()); + let b = simd_cast::<_, u32x16>(b.as_u16x16()); + let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); + transmute(simd_cast::<_, u16x16>(r)) } /// Averages packed unsigned 8-bit integers in `a` and `b`. @@ -355,7 +358,10 @@ pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpavgb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { - transmute(pavgb(a.as_u8x32(), b.as_u8x32())) + let a = simd_cast::<_, u16x32>(a.as_u8x32()); + let b = simd_cast::<_, u16x32>(b.as_u8x32()); + let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); + transmute(simd_cast::<_, u8x32>(r)) } /// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. 
@@ -458,7 +464,8 @@ pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m #[cfg_attr(test, assert_instr(vpblendvb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { - transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32())) + let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::splat(0)); + transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32())) } /// Broadcasts the low packed 8-bit integer from `a` to all elements of @@ -2060,7 +2067,9 @@ pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __ #[cfg_attr(test, assert_instr(vpmuldq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(pmuldq(a.as_i32x8(), b.as_i32x8())) + let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4())); + let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4())); + transmute(simd_mul(a, b)) } /// Multiplies the low unsigned 32-bit integers from each packed 64-bit @@ -2074,7 +2083,10 @@ pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmuludq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { - transmute(pmuludq(a.as_u32x8(), b.as_u32x8())) + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let mask = u64x4::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) } /// Multiplies the packed 16-bit integers in `a` and `b`, producing @@ -2087,7 +2099,10 @@ pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmulhw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmulhw(a.as_i16x16(), b.as_i16x16())) + let a = simd_cast::<_, i32x16>(a.as_i16x16()); + let b = simd_cast::<_, i32x16>(b.as_i16x16()); + let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); + transmute(simd_cast::(r)) } /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing @@ -2100,7 +2115,10 @@ pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmulhuw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmulhuw(a.as_u16x16(), b.as_u16x16())) + let a = simd_cast::<_, u32x16>(a.as_u16x16()); + let b = simd_cast::<_, u32x16>(b.as_u16x16()); + let r = simd_shr(simd_mul(a, b), u32x16::splat(16)); + transmute(simd_cast::(r)) } /// Multiplies the packed 16-bit integers in `a` and `b`, producing @@ -3629,12 +3647,6 @@ extern "C" { fn pabsw(a: i16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pabs.d"] fn pabsd(a: i32x8) -> u32x8; - #[link_name = "llvm.x86.avx2.pavg.b"] - fn pavgb(a: u8x32, b: u8x32) -> u8x32; - #[link_name = "llvm.x86.avx2.pavg.w"] - fn pavgw(a: u16x16, b: u16x16) -> u16x16; - #[link_name = "llvm.x86.avx2.pblendvb"] - fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.phadd.w"] fn phaddw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.phadd.d"] @@ -3669,14 +3681,6 @@ extern "C" { fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4); #[link_name = "llvm.x86.avx2.mpsadbw"] fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16; - #[link_name = "llvm.x86.avx2.pmulhu.w"] - fn pmulhuw(a: u16x16, b: u16x16) -> u16x16; - #[link_name = "llvm.x86.avx2.pmulh.w"] - fn pmulhw(a: 
i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx2.pmul.dq"] - fn pmuldq(a: i32x8, b: i32x8) -> i64x4; - #[link_name = "llvm.x86.avx2.pmulu.dq"] - fn pmuludq(a: u32x8, b: u32x8) -> u64x4; #[link_name = "llvm.x86.avx2.pmul.hr.sw"] fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.packsswb"] diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs index 92e572eb1..ce4e402a8 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs @@ -311,7 +311,7 @@ pub unsafe fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __ #[target_feature(enable = "avx512bitalg")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 { - transmute(bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0)) + bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0) } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. @@ -326,7 +326,7 @@ pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 #[target_feature(enable = "avx512bitalg")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m512i) -> __mmask64 { - transmute(bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k)) + bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k) } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. @@ -338,7 +338,7 @@ pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m #[target_feature(enable = "avx512bitalg,avx512vl")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 { - transmute(bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0)) + bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0) } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. @@ -353,7 +353,7 @@ pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 #[target_feature(enable = "avx512bitalg,avx512vl")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m256i) -> __mmask32 { - transmute(bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k)) + bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k) } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. @@ -365,7 +365,7 @@ pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m #[target_feature(enable = "avx512bitalg,avx512vl")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 { - transmute(bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0)) + bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0) } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. 
@@ -380,7 +380,7 @@ pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bitalg,avx512vl")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] pub unsafe fn _mm_mask_bitshuffle_epi64_mask(k: __mmask16, b: __m128i, c: __m128i) -> __mmask16 { - transmute(bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k)) + bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k) } #[cfg(test)] diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs index 364023539..0b4a56d36 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs @@ -3703,8 +3703,7 @@ pub unsafe fn _mm512_cmp_epu16_mask(a: __m512i, b: __m512i) -> static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x32(); let b = b.as_u16x32(); - let r = vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111); - transmute(r) + vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3722,8 +3721,7 @@ pub unsafe fn _mm512_mask_cmp_epu16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x32(); let b = b.as_u16x32(); - let r = vpcmpuw(a, b, IMM8, k1); - transmute(r) + vpcmpuw(a, b, IMM8, k1) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3737,8 +3735,7 @@ pub unsafe fn _mm256_cmp_epu16_mask(a: __m256i, b: __m256i) -> static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x16(); let b = b.as_u16x16(); - let r = vpcmpuw256(a, b, IMM8, 0b11111111_11111111); - transmute(r) + vpcmpuw256(a, b, IMM8, 0b11111111_11111111) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3756,8 +3753,7 @@ pub unsafe fn _mm256_mask_cmp_epu16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x16(); let b = b.as_u16x16(); - let r = vpcmpuw256(a, b, IMM8, k1); - transmute(r) + vpcmpuw256(a, b, IMM8, k1) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3771,8 +3767,7 @@ pub unsafe fn _mm_cmp_epu16_mask(a: __m128i, b: __m128i) -> __m static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x8(); let b = b.as_u16x8(); - let r = vpcmpuw128(a, b, IMM8, 0b11111111); - transmute(r) + vpcmpuw128(a, b, IMM8, 0b11111111) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3790,8 +3785,7 @@ pub unsafe fn _mm_mask_cmp_epu16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x8(); let b = b.as_u16x8(); - let r = vpcmpuw128(a, b, IMM8, k1); - transmute(r) + vpcmpuw128(a, b, IMM8, k1) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -3805,13 +3799,12 @@ pub unsafe fn _mm512_cmp_epu8_mask(a: __m512i, b: __m512i) -> _ static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x64(); let b = b.as_u8x64(); - let r = vpcmpub( + vpcmpub( a, b, IMM8, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - ); - transmute(r) + ) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3829,8 +3822,7 @@ pub unsafe fn _mm512_mask_cmp_epu8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x64(); let b = b.as_u8x64(); - let r = vpcmpub(a, b, IMM8, k1); - transmute(r) + vpcmpub(a, b, IMM8, k1) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3844,8 +3836,7 @@ pub unsafe fn _mm256_cmp_epu8_mask(a: __m256i, b: __m256i) -> _ static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x32(); let b = b.as_u8x32(); - let r = vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111); - transmute(r) + vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3863,8 +3854,7 @@ pub unsafe fn _mm256_mask_cmp_epu8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x32(); let b = b.as_u8x32(); - let r = vpcmpub256(a, b, IMM8, k1); - transmute(r) + vpcmpub256(a, b, IMM8, k1) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3878,8 +3868,7 @@ pub unsafe fn _mm_cmp_epu8_mask(a: __m128i, b: __m128i) -> __mm static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x16(); let b = b.as_u8x16(); - let r = vpcmpub128(a, b, IMM8, 0b11111111_11111111); - transmute(r) + vpcmpub128(a, b, IMM8, 0b11111111_11111111) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3897,8 +3886,7 @@ pub unsafe fn _mm_mask_cmp_epu8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x16(); let b = b.as_u8x16(); - let r = vpcmpub128(a, b, IMM8, k1); - transmute(r) + vpcmpub128(a, b, IMM8, k1) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3912,8 +3900,7 @@ pub unsafe fn _mm512_cmp_epi16_mask(a: __m512i, b: __m512i) -> static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x32(); let b = b.as_i16x32(); - let r = vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111); - transmute(r) + vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3931,8 +3918,7 @@ pub unsafe fn _mm512_mask_cmp_epi16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x32(); let b = b.as_i16x32(); - let r = vpcmpw(a, b, IMM8, k1); - transmute(r) + vpcmpw(a, b, IMM8, k1) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3946,8 +3932,7 @@ pub unsafe fn _mm256_cmp_epi16_mask(a: __m256i, b: __m256i) -> static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x16(); let b = b.as_i16x16(); - let r = vpcmpw256(a, b, IMM8, 0b11111111_11111111); - transmute(r) + vpcmpw256(a, b, IMM8, 0b11111111_11111111) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3965,8 +3950,7 @@ pub unsafe fn _mm256_mask_cmp_epi16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x16(); let b = b.as_i16x16(); - let r = vpcmpw256(a, b, IMM8, k1); - transmute(r) + vpcmpw256(a, b, IMM8, k1) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3980,8 +3964,7 @@ pub unsafe fn _mm_cmp_epi16_mask(a: __m128i, b: __m128i) -> __m static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x8(); let b = b.as_i16x8(); - let r = vpcmpw128(a, b, IMM8, 0b11111111); - transmute(r) + vpcmpw128(a, b, IMM8, 0b11111111) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3999,8 +3982,7 @@ pub unsafe fn _mm_mask_cmp_epi16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x8(); let b = b.as_i16x8(); - let r = vpcmpw128(a, b, IMM8, k1); - transmute(r) + vpcmpw128(a, b, IMM8, k1) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4014,13 +3996,12 @@ pub unsafe fn _mm512_cmp_epi8_mask(a: __m512i, b: __m512i) -> _ static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x64(); let b = b.as_i8x64(); - let r = vpcmpb( + vpcmpb( a, b, IMM8, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - ); - transmute(r) + ) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4038,8 +4019,7 @@ pub unsafe fn _mm512_mask_cmp_epi8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x64(); let b = b.as_i8x64(); - let r = vpcmpb(a, b, IMM8, k1); - transmute(r) + vpcmpb(a, b, IMM8, k1) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4053,8 +4033,7 @@ pub unsafe fn _mm256_cmp_epi8_mask(a: __m256i, b: __m256i) -> _ static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x32(); let b = b.as_i8x32(); - let r = vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111); - transmute(r) + vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4072,8 +4051,7 @@ pub unsafe fn _mm256_mask_cmp_epi8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x32(); let b = b.as_i8x32(); - let r = vpcmpb256(a, b, IMM8, k1); - transmute(r) + vpcmpb256(a, b, IMM8, k1) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4087,8 +4065,7 @@ pub unsafe fn _mm_cmp_epi8_mask(a: __m128i, b: __m128i) -> __mm static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x16(); let b = b.as_i8x16(); - let r = vpcmpb128(a, b, IMM8, 0b11111111_11111111); - transmute(r) + vpcmpb128(a, b, IMM8, 0b11111111_11111111) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4106,8 +4083,7 @@ pub unsafe fn _mm_mask_cmp_epi8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x16(); let b = b.as_i8x16(); - let r = vpcmpb128(a, b, IMM8, k1); - transmute(r) + vpcmpb128(a, b, IMM8, k1) } /// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. @@ -8566,7 +8542,7 @@ pub unsafe fn _mm_movm_epi8(k: __mmask16) -> __m128i { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - transmute(a + b) + a + b } /// Add 64-bit masks in a and b, and store the result in k. @@ -8575,7 +8551,7 @@ pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - transmute(a + b) + a + b } /// Compute the bitwise AND of 32-bit masks a and b, and store the result in k. @@ -8584,7 +8560,7 @@ pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - transmute(a & b) + a & b } /// Compute the bitwise AND of 64-bit masks a and b, and store the result in k. @@ -8593,7 +8569,7 @@ pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - transmute(a & b) + a & b } /// Compute the bitwise NOT of 32-bit mask a, and store the result in k. @@ -8602,7 +8578,7 @@ pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 { - transmute(a ^ 0b11111111_11111111_11111111_11111111) + a ^ 0b11111111_11111111_11111111_11111111 } /// Compute the bitwise NOT of 64-bit mask a, and store the result in k. @@ -8611,7 +8587,7 @@ pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 { - transmute(a ^ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111) + a ^ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 } /// Compute the bitwise NOT of 32-bit masks a and then AND with b, and store the result in k. 
@@ -8620,7 +8596,7 @@ pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - transmute(_knot_mask32(a) & b) + _knot_mask32(a) & b } /// Compute the bitwise NOT of 64-bit masks a and then AND with b, and store the result in k. @@ -8629,7 +8605,7 @@ pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - transmute(_knot_mask64(a) & b) + _knot_mask64(a) & b } /// Compute the bitwise OR of 32-bit masks a and b, and store the result in k. @@ -8638,7 +8614,7 @@ pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - transmute(a | b) + a | b } /// Compute the bitwise OR of 64-bit masks a and b, and store the result in k. @@ -8647,7 +8623,7 @@ pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - transmute(a | b) + a | b } /// Compute the bitwise XOR of 32-bit masks a and b, and store the result in k. @@ -8656,7 +8632,7 @@ pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - transmute(a ^ b) + a ^ b } /// Compute the bitwise XOR of 64-bit masks a and b, and store the result in k. @@ -8665,7 +8641,7 @@ pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - transmute(a ^ b) + a ^ b } /// Compute the bitwise XNOR of 32-bit masks a and b, and store the result in k. @@ -8674,7 +8650,7 @@ pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - transmute(_knot_mask32(a ^ b)) + _knot_mask32(a ^ b) } /// Compute the bitwise XNOR of 64-bit masks a and b, and store the result in k. @@ -8683,7 +8659,7 @@ pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] pub unsafe fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - transmute(_knot_mask64(a ^ b)) + _knot_mask64(a ^ b) } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. 
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs index 5412237ca..280135292 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs @@ -17144,7 +17144,7 @@ pub unsafe fn _mm512_slli_epi32(a: __m512i) -> __m512i { if IMM8 >= 32 { _mm512_setzero_si512() } else { - transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8 as u32))) + transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8))) } } @@ -20132,7 +20132,7 @@ pub unsafe fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m5 #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermd pub unsafe fn _mm256_permutexvar_epi32(idx: __m256i, a: __m256i) -> __m256i { - transmute(_mm256_permutevar8x32_epi32(a, idx)) // llvm use llvm.x86.avx2.permd + _mm256_permutevar8x32_epi32(a, idx) // llvm use llvm.x86.avx2.permd } /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20284,7 +20284,7 @@ pub unsafe fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512) #[target_feature(enable = "avx512f,avx512vl")] #[cfg_attr(test, assert_instr(vpermps))] pub unsafe fn _mm256_permutexvar_ps(idx: __m256i, a: __m256) -> __m256 { - transmute(_mm256_permutevar8x32_ps(a, idx)) //llvm.x86.avx2.permps + _mm256_permutevar8x32_ps(a, idx) //llvm.x86.avx2.permps } /// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -23943,7 +23943,7 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d { #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(vmovd))] pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { let extract: i32 = simd_extract(a.as_i32x16(), 0); - transmute(extract) + extract } /// Broadcast the low packed 32-bit integer from a to all elements of dst. @@ -25744,7 +25744,7 @@ pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(a & b) + a & b } /// Compute the bitwise AND of 16-bit masks a and b, and store the result in k. @@ -25754,7 +25754,7 @@ pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(a & b) + a & b } /// Compute the bitwise OR of 16-bit masks a and b, and store the result in k. @@ -25764,7 +25764,7 @@ pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(a | b) + a | b } /// Compute the bitwise OR of 16-bit masks a and b, and store the result in k. 
@@ -25774,7 +25774,7 @@ pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(a | b) + a | b } /// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k. @@ -25784,7 +25784,7 @@ pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(a ^ b) + a ^ b } /// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k. @@ -25794,7 +25794,7 @@ pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(a ^ b) + a ^ b } /// Compute the bitwise NOT of 16-bit mask a, and store the result in k. @@ -25803,7 +25803,7 @@ pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 { #[inline] #[target_feature(enable = "avx512f")] pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 { - transmute(a ^ 0b11111111_11111111) + a ^ 0b11111111_11111111 } /// Compute the bitwise NOT of 16-bit mask a, and store the result in k. @@ -25812,7 +25812,7 @@ pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 { #[inline] #[target_feature(enable = "avx512f")] pub unsafe fn _mm512_knot(a: __mmask16) -> __mmask16 { - transmute(a ^ 0b11111111_11111111) + a ^ 0b11111111_11111111 } /// Compute the bitwise NOT of 16-bit masks a and then AND with b, and store the result in k. @@ -25862,8 +25862,7 @@ pub unsafe fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 { - let r: u16 = a; - transmute(r) + a } /// Converts integer mask into bitmask, storing the result in dst. @@ -25872,8 +25871,7 @@ pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 { #[inline] #[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 { - let r: u16 = mask as u16; - transmute(r) + mask as u16 } /// Converts bit mask k1 into an integer value, storing the results in dst. @@ -25883,8 +25881,7 @@ pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 { - let r: i32 = k1 as i32; - transmute(r) + k1 as i32 } /// Unpack and interleave 8 bits from masks a and b, and store the 16-bit result in k. @@ -25896,7 +25893,7 @@ pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 { pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 { let a = a & 0b00000000_11111111; let b = b & 0b11111111_00000000; - transmute(a | b) + a | b } /// Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's. 
@@ -32352,8 +32349,7 @@ pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) - if (k & 0b00000001) != 0 { mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, mov); - transmute(r) + simd_insert(a, 0, mov) } /// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32367,8 +32363,7 @@ pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { if (k & 0b00000001) != 0 { mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, mov); - transmute(r) + simd_insert(a, 0, mov) } /// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32383,8 +32378,7 @@ pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d if (k & 0b00000001) != 0 { mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, mov); - transmute(r) + simd_insert(a, 0, mov) } /// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32398,8 +32392,7 @@ pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d if (k & 0b00000001) != 0 { mov = simd_extract(b, 0); } - let r = simd_insert(a, 0, mov); - transmute(r) + simd_insert(a, 0, mov) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32416,8 +32409,7 @@ pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> let extractb: f32 = simd_extract(b, 0); add = extracta + extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32433,8 +32425,7 @@ pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { let extractb: f32 = simd_extract(b, 0); add = extracta + extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32451,8 +32442,7 @@ pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) let extractb: f64 = simd_extract(b, 0); add = extracta + extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -32468,8 +32458,7 @@ pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { let extractb: f64 = simd_extract(b, 0); add = extracta + extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32486,8 +32475,7 @@ pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> let extractb: f32 = simd_extract(b, 0); add = extracta - extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32503,8 +32491,7 @@ pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { let extractb: f32 = simd_extract(b, 0); add = extracta - extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32521,8 +32508,7 @@ pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) let extractb: f64 = simd_extract(b, 0); add = extracta - extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32538,8 +32524,7 @@ pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { let extractb: f64 = simd_extract(b, 0); add = extracta - extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32556,8 +32541,7 @@ pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> let extractb: f32 = simd_extract(b, 0); add = extracta * extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
@@ -32573,8 +32557,7 @@ pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { let extractb: f32 = simd_extract(b, 0); add = extracta * extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32591,8 +32574,7 @@ pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) let extractb: f64 = simd_extract(b, 0); add = extracta * extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32608,8 +32590,7 @@ pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { let extractb: f64 = simd_extract(b, 0); add = extracta * extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32626,8 +32607,7 @@ pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> let extractb: f32 = simd_extract(b, 0); add = extracta / extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -32643,8 +32623,7 @@ pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { let extractb: f32 = simd_extract(b, 0); add = extracta / extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -32661,8 +32640,7 @@ pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) let extractb: f64 = simd_extract(b, 0); add = extracta / extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -32678,8 +32656,7 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { let extractb: f64 = simd_extract(b, 0); add = extracta / extractb; } - let r = simd_insert(a, 0, add); - transmute(r) + simd_insert(a, 0, add) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33587,8 +33564,7 @@ pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> let extractc: f32 = simd_extract(c, 0); fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33605,8 +33581,7 @@ pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - let extractc: f32 = simd_extract(c, 0); fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. @@ -33622,8 +33597,7 @@ pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) - let extractb: f32 = simd_extract(b, 0); fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fmadd); - transmute(r) + simd_insert(c, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -33639,8 +33613,7 @@ pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) let extractc: f64 = simd_extract(c, 0); fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -33657,8 +33630,7 @@ pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d let extractc: f64 = simd_extract(c, 0); fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. @@ -33674,8 +33646,7 @@ pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 let extractb: f64 = simd_extract(b, 0); fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fmadd); - transmute(r) + simd_insert(c, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33692,8 +33663,7 @@ pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> let extractc = -extractc; fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33711,8 +33681,7 @@ pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - let extractc = -extractc; fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. @@ -33729,8 +33698,7 @@ pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) - let extractc = -fmsub; fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fmsub); - transmute(r) + simd_insert(c, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -33747,8 +33715,7 @@ pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) let extractc = -extractc; fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -33766,8 +33733,7 @@ pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d let extractc = -extractc; fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. @@ -33784,8 +33750,7 @@ pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 let extractc = -fmsub; fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fmsub); - transmute(r) + simd_insert(c, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33802,8 +33767,7 @@ pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - let extractc: f32 = simd_extract(c, 0); fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33821,8 +33785,7 @@ pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) let extractc: f32 = simd_extract(c, 0); fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. @@ -33839,8 +33802,7 @@ pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) let extractb: f32 = simd_extract(b, 0); fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fnmadd); - transmute(r) + simd_insert(c, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -33857,8 +33819,7 @@ pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d let extractc: f64 = simd_extract(c, 0); fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -33876,8 +33837,7 @@ pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 let extractc: f64 = simd_extract(c, 0); fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. @@ -33894,8 +33854,7 @@ pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask let extractb: f64 = simd_extract(b, 0); fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fnmadd); - transmute(r) + simd_insert(c, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33913,8 +33872,7 @@ pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - let extractc = -extractc; fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -33933,8 +33891,7 @@ pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) let extractc = -extractc; fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
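All of the fused variants above are routed through a single fused multiply-add (`vfmadd132ss`/`vfmadd132sd`), with operands negated beforehand to obtain the fmsub, fnmadd, and fnmsub flavours. A scalar model of that identity, using `f32::mul_add` as a stand-in for the fused operation:

    fn fma(a: f32, b: f32, c: f32) -> f32 { a.mul_add(b, c) }  //  a*b + c

    fn fmsub(a: f32, b: f32, c: f32) -> f32 { fma(a, b, -c) }   //  a*b - c
    fn fnmadd(a: f32, b: f32, c: f32) -> f32 { fma(-a, b, c) }  // -(a*b) + c
    fn fnmsub(a: f32, b: f32, c: f32) -> f32 { fma(-a, b, -c) } // -(a*b) - c

    fn main() {
        assert_eq!(fmsub(2.0, 3.0, 1.0), 5.0);
        assert_eq!(fnmadd(2.0, 3.0, 1.0), -5.0);
        assert_eq!(fnmsub(2.0, 3.0, 1.0), -7.0);
    }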
@@ -33952,8 +33909,7 @@ pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) let extractc = -fnmsub; fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fnmsub); - transmute(r) + simd_insert(c, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -33971,8 +33927,7 @@ pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d let extractc = -extractc; fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -33991,8 +33946,7 @@ pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 let extractc = -extractc; fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. @@ -34010,8 +33964,7 @@ pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask let extractc = -fnmsub; fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - let r = simd_insert(c, 0, fnmsub); - transmute(r) + simd_insert(c, 0, fnmsub) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -35705,8 +35658,7 @@ pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: _ let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let r = vfmadd132ss(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, r); - transmute(r) + simd_insert(a, 0, r) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -35736,8 +35688,7 @@ pub unsafe fn _mm_mask_fmadd_round_ss( let extractc: f32 = simd_extract(c, 0); fmadd = vfmadd132ss(fmadd, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -35768,8 +35719,7 @@ pub unsafe fn _mm_maskz_fmadd_round_ss( let extractc: f32 = simd_extract(c, 0); fmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -35799,8 +35749,7 @@ pub unsafe fn _mm_mask3_fmadd_round_ss( let extractb: f32 = simd_extract(b, 0); fmadd = vfmadd132ss(extracta, extractb, fmadd, ROUNDING); } - let r = simd_insert(c, 0, fmadd); - transmute(r) + simd_insert(c, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -35827,8 +35776,7 @@ pub unsafe fn _mm_fmadd_round_sd( let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -35858,8 +35806,7 @@ pub unsafe fn _mm_mask_fmadd_round_sd( let extractc: f64 = simd_extract(c, 0); fmadd = vfmadd132sd(fmadd, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -35890,8 +35837,7 @@ pub unsafe fn _mm_maskz_fmadd_round_sd( let extractc: f64 = simd_extract(c, 0); fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmadd); - transmute(r) + simd_insert(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -35921,8 +35867,7 @@ pub unsafe fn _mm_mask3_fmadd_round_sd( let extractb: f64 = simd_extract(b, 0); fmadd = vfmadd132sd(extracta, extractb, fmadd, ROUNDING); } - let r = simd_insert(c, 0, fmadd); - transmute(r) + simd_insert(c, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -35946,8 +35891,7 @@ pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: _ let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; let fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -35978,8 +35922,7 @@ pub unsafe fn _mm_mask_fmsub_round_ss( let extractc = -extractc; fmsub = vfmadd132ss(fmsub, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36011,8 +35954,7 @@ pub unsafe fn _mm_maskz_fmsub_round_ss( let extractc = -extractc; fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -36043,8 +35985,7 @@ pub unsafe fn _mm_mask3_fmsub_round_ss( let extractc = -fmsub; fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(c, 0, fmsub); - transmute(r) + simd_insert(c, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -36072,8 +36013,7 @@ pub unsafe fn _mm_fmsub_round_sd( let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; let fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36104,8 +36044,7 @@ pub unsafe fn _mm_mask_fmsub_round_sd( let extractc = -extractc; fmsub = vfmadd132sd(fmsub, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36137,8 +36076,7 @@ pub unsafe fn _mm_maskz_fmsub_round_sd( let extractc = -extractc; fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fmsub); - transmute(r) + simd_insert(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -36169,8 +36107,7 @@ pub unsafe fn _mm_mask3_fmsub_round_sd( let extractc = -fmsub; fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(c, 0, fmsub); - transmute(r) + simd_insert(c, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36194,8 +36131,7 @@ pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36226,8 +36162,7 @@ pub unsafe fn _mm_mask_fnmadd_round_ss( let extractc: f32 = simd_extract(c, 0); fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36259,8 +36194,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_ss( let extractc: f32 = simd_extract(c, 0); fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -36291,8 +36225,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_ss( let extractb: f32 = simd_extract(b, 0); fnmadd = vfmadd132ss(extracta, extractb, fnmadd, ROUNDING); } - let r = simd_insert(c, 0, fnmadd); - transmute(r) + simd_insert(c, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -36320,8 +36253,7 @@ pub unsafe fn _mm_fnmadd_round_sd( let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36352,8 +36284,7 @@ pub unsafe fn _mm_mask_fnmadd_round_sd( let extractc: f64 = simd_extract(c, 0); fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36385,8 +36316,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_sd( let extractc: f64 = simd_extract(c, 0); fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmadd); - transmute(r) + simd_insert(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -36417,8 +36347,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd( let extractb: f64 = simd_extract(b, 0); fnmadd = vfmadd132sd(extracta, extractb, fnmadd, ROUNDING); } - let r = simd_insert(c, 0, fnmadd); - transmute(r) + simd_insert(c, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36443,8 +36372,7 @@ pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; let fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36476,8 +36404,7 @@ pub unsafe fn _mm_mask_fnmsub_round_ss( let extractc = -extractc; fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36510,8 +36437,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss( let extractc = -extractc; fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -36543,8 +36469,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_ss( let extractc = -fnmsub; fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(c, 0, fnmsub); - transmute(r) + simd_insert(c, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -36573,8 +36498,7 @@ pub unsafe fn _mm_fnmsub_round_sd( let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; let fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36606,8 +36530,7 @@ pub unsafe fn _mm_mask_fnmsub_round_sd( let extractc = -extractc; fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36640,8 +36563,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd( let extractc = -extractc; fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(a, 0, fnmsub); - transmute(r) + simd_insert(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -36673,8 +36595,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_sd( let extractc = -fnmsub; fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - let r = simd_insert(c, 0, fnmsub); - transmute(r) + simd_insert(c, 0, fnmsub) } /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
imm8 is used to set the required flags reporting. @@ -37168,8 +37089,7 @@ pub unsafe fn _mm_maskz_cvt_roundsd_ss( pub unsafe fn _mm_cvt_roundss_si32(a: __m128) -> i32 { static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - let r = vcvtss2si(a, ROUNDING); - transmute(r) + vcvtss2si(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ @@ -37188,8 +37108,7 @@ pub unsafe fn _mm_cvt_roundss_si32(a: __m128) -> i32 { pub unsafe fn _mm_cvt_roundss_i32(a: __m128) -> i32 { static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - let r = vcvtss2si(a, ROUNDING); - transmute(r) + vcvtss2si(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ @@ -37208,8 +37127,7 @@ pub unsafe fn _mm_cvt_roundss_i32(a: __m128) -> i32 { pub unsafe fn _mm_cvt_roundss_u32(a: __m128) -> u32 { static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - let r = vcvtss2usi(a, ROUNDING); - transmute(r) + vcvtss2usi(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst. @@ -37219,7 +37137,7 @@ pub unsafe fn _mm_cvt_roundss_u32(a: __m128) -> u32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2si))] pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 { - transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst. @@ -37229,7 +37147,7 @@ pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2usi))] pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 { - transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ @@ -37248,8 +37166,7 @@ pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 { pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d) -> i32 { static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - let r = vcvtsd2si(a, ROUNDING); - transmute(r) + vcvtsd2si(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ @@ -37268,8 +37185,7 @@ pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d) -> i32 { pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d) -> i32 { static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - let r = vcvtsd2si(a, ROUNDING); - transmute(r) + vcvtsd2si(a, ROUNDING) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ @@ -37288,8 +37204,7 @@ pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d) -> i32 { pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d) -> u32 { static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - let r = vcvtsd2usi(a, ROUNDING); - transmute(r) + vcvtsd2usi(a, ROUNDING) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst. 
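The `*_round_*` conversions, like the `*_round_*` FMA variants above, take the rounding mode as a const generic checked by `static_assert_rounding!`; the only change here is returning the `i32`/`u32` from `vcvtss2si`/`vcvtsd2si` directly instead of transmuting it. A hedged usage sketch, assuming a nightly toolchain (these AVX-512F intrinsics are still unstable, feature gate omitted) and runtime AVX-512F support:

    use std::arch::x86_64::*;

    fn main() {
        if is_x86_feature_detected!("avx512f") {
            unsafe {
                let a = _mm_set_ss(2.5);
                // Round to nearest (ties to even) and suppress exceptions.
                let r = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
                assert_eq!(r, 2); // 2.5 rounds to the even neighbour
            }
        }
    }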
@@ -37299,7 +37214,7 @@ pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d) -> u32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2si))] pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 { - transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst. @@ -37309,7 +37224,7 @@ pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2usi))] pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 { - transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37382,8 +37297,7 @@ pub unsafe fn _mm_cvt_roundu32_ss(a: __m128, b: u32) -> __m #[cfg_attr(test, assert_instr(vcvtsi2ss))] pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 { let b = b as f32; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the signed 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. @@ -37394,8 +37308,7 @@ pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 { #[cfg_attr(test, assert_instr(vcvtsi2sd))] pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { let b = b as f64; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ @@ -37409,8 +37322,7 @@ pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { pub unsafe fn _mm_cvtt_roundss_si32(a: __m128) -> i32 { static_assert_sae!(SAE); let a = a.as_f32x4(); - let r = vcvtss2si(a, SAE); - transmute(r) + vcvtss2si(a, SAE) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ @@ -37424,8 +37336,7 @@ pub unsafe fn _mm_cvtt_roundss_si32(a: __m128) -> i32 { pub unsafe fn _mm_cvtt_roundss_i32(a: __m128) -> i32 { static_assert_sae!(SAE); let a = a.as_f32x4(); - let r = vcvtss2si(a, SAE); - transmute(r) + vcvtss2si(a, SAE) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ @@ -37439,8 +37350,7 @@ pub unsafe fn _mm_cvtt_roundss_i32(a: __m128) -> i32 { pub unsafe fn _mm_cvtt_roundss_u32(a: __m128) -> u32 { static_assert_sae!(SAE); let a = a.as_f32x4(); - let r = vcvtss2usi(a, SAE); - transmute(r) + vcvtss2usi(a, SAE) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. 
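`_mm_cvti32_ss`/`_mm_cvti32_sd` (and their unsigned counterparts further down) are simply "convert the integer, then overwrite lane 0 of `a`". A plain-Rust model of that behaviour on the array representation (illustrative only):

    fn cvti32_ss_model(a: [f32; 4], b: i32) -> [f32; 4] {
        let mut dst = a;     // upper three lanes are copied from a
        dst[0] = b as f32;   // lane 0 becomes the converted integer
        dst
    }

    fn main() {
        assert_eq!(cvti32_ss_model([9.0, 1.0, 2.0, 3.0], 7), [7.0, 1.0, 2.0, 3.0]);
    }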
@@ -37450,7 +37360,7 @@ pub unsafe fn _mm_cvtt_roundss_u32(a: __m128) -> u32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2si))] pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 { - transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. @@ -37460,7 +37370,7 @@ pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2usi))] pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 { - transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ @@ -37474,8 +37384,7 @@ pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 { pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d) -> i32 { static_assert_sae!(SAE); let a = a.as_f64x2(); - let r = vcvtsd2si(a, SAE); - transmute(r) + vcvtsd2si(a, SAE) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ @@ -37489,8 +37398,7 @@ pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d) -> i32 { pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d) -> i32 { static_assert_sae!(SAE); let a = a.as_f64x2(); - let r = vcvtsd2si(a, SAE); - transmute(r) + vcvtsd2si(a, SAE) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ @@ -37504,8 +37412,7 @@ pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d) -> i32 { pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d) -> u32 { static_assert_sae!(SAE); let a = a.as_f64x2(); - let r = vcvtsd2usi(a, SAE); - transmute(r) + vcvtsd2usi(a, SAE) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. @@ -37515,7 +37422,7 @@ pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d) -> u32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2si))] pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 { - transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. @@ -37525,7 +37432,7 @@ pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2usi))] pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 { - transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
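The `cvtt*` conversions above always truncate toward zero, independent of MXCSR, while the plain `cvt*` forms use the current rounding mode (round to nearest, ties to even, by default). A scalar sketch of the difference; `f32::round` stands in for the default mode and only differs from ties-to-even on exact .5 ties, which this example avoids:

    fn cvtt_model(x: f32) -> i32 { x.trunc() as i32 } // always toward zero
    fn cvt_model(x: f32) -> i32 { x.round() as i32 }  // nearest (no ties here)

    fn main() {
        assert_eq!(cvtt_model(2.9), 2);
        assert_eq!(cvt_model(2.9), 3);
        assert_eq!(cvtt_model(-2.9), -2);
        assert_eq!(cvt_model(-2.9), -3);
    }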
@@ -37536,8 +37443,7 @@ pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 { #[cfg_attr(test, assert_instr(vcvtusi2ss))] pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 { let b = b as f32; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. @@ -37548,8 +37454,7 @@ pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 { #[cfg_attr(test, assert_instr(vcvtusi2sd))] pub unsafe fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d { let b = b as f64; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\ @@ -37565,8 +37470,7 @@ pub unsafe fn _mm_comi_round_ss(a: __m128, b: _ static_assert_mantissas_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); - let r = vcomiss(a, b, IMM5, SAE); - transmute(r) + vcomiss(a, b, IMM5, SAE) } /// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\ @@ -37582,8 +37486,7 @@ pub unsafe fn _mm_comi_round_sd(a: __m128d, b: static_assert_mantissas_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); - let r = vcomisd(a, b, IMM5, SAE); - transmute(r) + vcomisd(a, b, IMM5, SAE) } /// Equal diff --git a/library/stdarch/crates/core_arch/src/x86/sse.rs b/library/stdarch/crates/core_arch/src/x86/sse.rs index 3d4471ba3..6a2be0921 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse.rs @@ -790,8 +790,7 @@ pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { /// /// The result is rounded according to the current rounding mode. If the result /// cannot be represented as a 32 bit integer the result will be `0x8000_0000` -/// (`i32::MIN`) or an invalid operation floating point exception if -/// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). +/// (`i32::MIN`). /// /// This corresponds to the `CVTSS2SI` instruction (with 32 bit output). /// @@ -821,8 +820,7 @@ pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 { /// /// The result is rounded always using truncation (round towards zero). If the /// result cannot be represented as a 32 bit integer the result will be -/// `0x8000_0000` (`i32::MIN`) or an invalid operation floating point -/// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). +/// `0x8000_0000` (`i32::MIN`). /// /// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output). /// @@ -1083,7 +1081,10 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 { #[cfg_attr(test, assert_instr(movmskps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { - movmskps(a) + // Propagate the highest bit to the rest, because simd_bitmask + // requires all-1 or all-0. + let mask: i32x4 = simd_lt(transmute(a), i32x4::splat(0)); + simd_bitmask::(mask).into() } /// Construct a `__m128` with the lowest element read from `p` and the other @@ -1365,6 +1366,15 @@ pub unsafe fn _mm_sfence() { /// Gets the unsigned 32-bit value of the MXCSR control and status register. 
/// +/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust +/// floating-point operations may or may not result in this register getting updated with exception +/// state, and the register can change between two invocations of this function even when no +/// floating-point operations appear in the source code (since floating-point operations appearing +/// earlier or later can be reordered). +/// +/// If you need to perform some floating-point operations and check whether they raised an +/// exception, use an inline assembly block for the entire sequence of operations. +/// /// For more info see [`_mm_setcsr`](fn._mm_setcsr.html) /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr) @@ -1372,6 +1382,10 @@ pub unsafe fn _mm_sfence() { #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(stmxcsr))] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] pub unsafe fn _mm_getcsr() -> u32 { let mut result = 0_i32; stmxcsr(&mut result as *mut _ as *mut i8); @@ -1401,6 +1415,16 @@ pub unsafe fn _mm_getcsr() -> u32 { /// * The *denormals-are-zero mode flag* turns all numbers which would be /// denormalized (exponent bits are all zeros) into zeros. /// +/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to +/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and +/// will optimize accordingly. This even applies when the register is altered and later reset to its +/// original value without any floating-point operations appearing in the source code between those +/// operations (since floating-point operations appearing earlier or later can be reordered). +/// +/// If you need to perform some floating-point operations under a different masking flags, rounding +/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the +/// original MXCSR register state before the end of the block. 
+/// /// ## Exception Flags /// /// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing @@ -1509,6 +1533,10 @@ pub unsafe fn _mm_getcsr() -> u32 { #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ldmxcsr))] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] pub unsafe fn _mm_setcsr(val: u32) { ldmxcsr(&val as *const _ as *const i8); } @@ -1588,9 +1616,14 @@ pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { _mm_getcsr() & _MM_MASK_MASK } @@ -1599,9 +1632,14 @@ pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { _mm_getcsr() & _MM_EXCEPT_MASK } @@ -1610,9 +1648,14 @@ pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { _mm_getcsr() & _MM_FLUSH_ZERO_MASK } @@ -1621,9 +1664,14 @@ pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_getcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { _mm_getcsr() & _MM_ROUND_MASK } @@ -1632,9 +1680,14 @@ pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x) } @@ -1643,9 +1696,14 @@ pub unsafe fn 
_MM_SET_EXCEPTION_MASK(x: u32) { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x) } @@ -1654,9 +1712,14 @@ pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x; // println!("setting csr={:x}", val); @@ -1667,9 +1730,14 @@ pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE) #[inline] +#[allow(deprecated)] // Deprecated function implemented on top of deprecated function #[allow(non_snake_case)] #[target_feature(enable = "sse")] #[stable(feature = "simd_x86", since = "1.27.0")] +#[deprecated( + since = "1.75.0", + note = "see `_mm_setcsr` documentation - use inline assembly instead" +)] pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) { _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x) } @@ -1820,8 +1888,6 @@ extern "C" { fn maxss(a: __m128, b: __m128) -> __m128; #[link_name = "llvm.x86.sse.max.ps"] fn maxps(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse.movmsk.ps"] - fn movmskps(a: __m128) -> i32; #[link_name = "llvm.x86.sse.cmp.ps"] fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128; #[link_name = "llvm.x86.sse.comieq.ss"] @@ -1974,7 +2040,11 @@ mod tests { let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); let r = _mm_rcp_ss(a); let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0); - assert_eq_m128(r, e); + let rel_err = 0.00048828125; + assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err); + for i in 1..4 { + assert_eq!(get_m128(r, i), get_m128(e, i)); + } } #[simd_test(enable = "sse")] @@ -2055,6 +2125,17 @@ mod tests { let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); let r = _mm_max_ps(a, b); assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0)); + + // Check SSE-specific semantics for -0.0 handling. 
+ let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); + let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_max_ps(a, b)); + let r2: [u8; 16] = transmute(_mm_max_ps(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present } #[simd_test(enable = "sse")] @@ -2098,12 +2179,12 @@ mod tests { let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0); let r: u32x4 = transmute(_mm_cmpeq_ss(a, b)); - let e: u32x4 = transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0)); + let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0)); assert_eq!(r, e); let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2)); - let e2: u32x4 = transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0)); + let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0)); assert_eq!(r2, e2); } @@ -2119,15 +2200,15 @@ mod tests { let d1 = !0u32; // a.extract(0) < d.extract(0) let rb: u32x4 = transmute(_mm_cmplt_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmplt_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmplt_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2143,15 +2224,15 @@ mod tests { let d1 = !0u32; // a.extract(0) <= d.extract(0) let rb: u32x4 = transmute(_mm_cmple_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmple_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmple_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2167,15 +2248,15 @@ mod tests { let d1 = 0u32; // a.extract(0) > d.extract(0) let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2191,15 +2272,15 @@ mod tests { let d1 = 0u32; // a.extract(0) >= d.extract(0) let rb: u32x4 = transmute(_mm_cmpge_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpge_ss(a, c)); - 
let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpge_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2215,15 +2296,15 @@ mod tests { let d1 = !0u32; // a.extract(0) != d.extract(0) let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2244,15 +2325,15 @@ mod tests { let d1 = 0u32; // a.extract(0) >= d.extract(0) let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2273,15 +2354,15 @@ mod tests { let d1 = 0u32; // a.extract(0) > d.extract(0) let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2302,15 +2383,15 @@ mod tests { let d1 = !0u32; // a.extract(0) <= d.extract(0) let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2331,15 +2412,15 @@ mod tests { let d1 = !0u32; // a.extract(0) < d.extract(0) let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = 
transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2355,15 +2436,15 @@ mod tests { let d1 = !0u32; // a.extract(0) ord d.extract(0) let rb: u32x4 = transmute(_mm_cmpord_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpord_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpord_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2379,15 +2460,15 @@ mod tests { let d1 = 0u32; // a.extract(0) unord d.extract(0) let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b)); - let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); + let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0)); assert_eq!(rb, eb); let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c)); - let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); + let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0)); assert_eq!(rc, ec); let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d)); - let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); + let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0)); assert_eq!(rd, ed); } @@ -2766,7 +2847,9 @@ mod tests { } } + #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions #[simd_test(enable = "sse")] + #[cfg_attr(miri, ignore)] // Uses _mm_setcsr, which is not supported by Miri unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() { // If one of the arguments is a quiet NaN `comieq_ss` should signal an // Invalid Operation Exception while `ucomieq_ss` should not. 
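The deprecation notes above point to inline assembly as the replacement for `_mm_getcsr`/`_mm_setcsr`. A minimal sketch of that replacement, assuming an x86-64 target and `std::arch::asm`; the helper names are illustrative and not part of this patch:

```rust
use std::arch::asm;

// Read MXCSR, roughly what the deprecated `_mm_getcsr` does.
unsafe fn read_mxcsr() -> u32 {
    let mut csr: u32 = 0;
    asm!("stmxcsr [{}]", in(reg) &mut csr, options(nostack, preserves_flags));
    csr
}

// Write MXCSR, roughly what the deprecated `_mm_setcsr` does.
unsafe fn write_mxcsr(csr: u32) {
    asm!("ldmxcsr [{}]", in(reg) &csr, options(nostack, preserves_flags));
}
```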
@@ -3072,7 +3155,7 @@ mod tests { let mut p = vals.as_mut_ptr(); if (p as usize) & 0xf != 0 { - ofs = ((16 - (p as usize)) & 0xf) >> 2; + ofs = (16 - ((p as usize) & 0xf)) >> 2; p = p.add(ofs); } @@ -3098,7 +3181,7 @@ mod tests { // Align p to 16-byte boundary if (p as usize) & 0xf != 0 { - ofs = ((16 - (p as usize)) & 0xf) >> 2; + ofs = (16 - ((p as usize) & 0xf)) >> 2; p = p.add(ofs); } @@ -3124,7 +3207,7 @@ mod tests { // Align p to 16-byte boundary if (p as usize) & 0xf != 0 { - ofs = ((16 - (p as usize)) & 0xf) >> 2; + ofs = (16 - ((p as usize) & 0xf)) >> 2; p = p.add(ofs); } @@ -3186,11 +3269,15 @@ mod tests { } #[simd_test(enable = "sse")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + #[cfg_attr(miri, ignore)] unsafe fn test_mm_sfence() { _mm_sfence(); } + #[allow(deprecated)] // FIXME: This tests functions that are immediate UB #[simd_test(enable = "sse")] + #[cfg_attr(miri, ignore)] // Miri does not support accesing the CSR unsafe fn test_mm_getcsr_setcsr_1() { let saved_csr = _mm_getcsr(); @@ -3206,7 +3293,9 @@ mod tests { assert_eq_m128(r, exp); // first component is a denormalized f32 } + #[allow(deprecated)] // FIXME: This tests functions that are immediate UB #[simd_test(enable = "sse")] + #[cfg_attr(miri, ignore)] // Miri does not support accesing the CSR unsafe fn test_mm_getcsr_setcsr_2() { // Same as _mm_setcsr_1 test, but with opposite flag value. @@ -3224,7 +3313,9 @@ mod tests { assert_eq_m128(r, exp); // first component is a denormalized f32 } + #[allow(deprecated)] // FIXME: This tests functions that are immediate UB #[simd_test(enable = "sse")] + #[cfg_attr(miri, ignore)] // Miri does not support accesing the CSR unsafe fn test_mm_getcsr_setcsr_underflow() { _MM_SET_EXCEPTION_STATE(0); @@ -3263,6 +3354,9 @@ mod tests { } #[simd_test(enable = "sse")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] unsafe fn test_mm_stream_ps() { let a = _mm_set1_ps(7.0); let mut mem = Memory { data: [-1.0; 4] }; diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs index 3d572a1f5..7831ea743 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse2.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs @@ -165,7 +165,10 @@ pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pavgb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { - transmute(pavgb(a.as_u8x16(), b.as_u8x16())) + let a = simd_cast::<_, u16x16>(a.as_u8x16()); + let b = simd_cast::<_, u16x16>(b.as_u8x16()); + let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); + transmute(simd_cast::<_, u8x16>(r)) } /// Averages packed unsigned 16-bit integers in `a` and `b`. @@ -176,7 +179,10 @@ pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pavgw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { - transmute(pavgw(a.as_u16x8(), b.as_u16x8())) + let a = simd_cast::<_, u32x8>(a.as_u16x8()); + let b = simd_cast::<_, u32x8>(b.as_u16x8()); + let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); + transmute(simd_cast::<_, u16x8>(r)) } /// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`. 
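The `_mm_avg_epu8`/`_mm_avg_epu16` bodies above replace the `pavgb`/`pavgw` link-name intrinsics with generic vector arithmetic. A scalar model of the same computation, included only to make the rounding bit and the need for widening explicit; the helper name is made up for the example:

```rust
// Rounding average: widen, add the rounding bit, shift back down.
// In u8 arithmetic `a + b + 1` could wrap; in u16 it cannot.
fn avg_u8(a: u8, b: u8) -> u8 {
    ((u16::from(a) + u16::from(b) + 1) >> 1) as u8
}

fn main() {
    assert_eq!(avg_u8(255, 254), 255); // would overflow without widening
    assert_eq!(avg_u8(0, 1), 1); // ties round up, matching `pavgb`
}
```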
@@ -261,7 +267,10 @@ pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pmulhw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pmulhw(a.as_i16x8(), b.as_i16x8()))
+    let a = simd_cast::<_, i32x8>(a.as_i16x8());
+    let b = simd_cast::<_, i32x8>(b.as_i16x8());
+    let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
+    transmute(simd_cast::<i32x8, i16x8>(r))
 }
 
 /// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
@@ -275,7 +284,10 @@ pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pmulhuw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pmulhuw(a.as_u16x8(), b.as_u16x8()))
+    let a = simd_cast::<_, u32x8>(a.as_u16x8());
+    let b = simd_cast::<_, u32x8>(b.as_u16x8());
+    let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
+    transmute(simd_cast::<u32x8, u16x8>(r))
 }
 
 /// Multiplies the packed 16-bit integers in `a` and `b`.
@@ -303,7 +315,10 @@ pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(pmuludq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
-    transmute(pmuludq(a.as_u32x4(), b.as_u32x4()))
+    let a = a.as_u64x2();
+    let b = b.as_u64x2();
+    let mask = u64x2::splat(u32::MAX.into());
+    transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
 }
 
 /// Sum the absolute differences of packed unsigned 8-bit integers.
@@ -952,7 +967,7 @@ pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
 #[cfg_attr(test, assert_instr(cvtdq2ps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
-    cvtdq2ps(a.as_i32x4())
+    transmute(simd_cast::<_, f32x4>(a.as_i32x4()))
 }
 
 /// Converts packed single-precision (32-bit) floating-point elements in `a`
@@ -2240,7 +2255,9 @@ pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
 #[cfg_attr(test, assert_instr(cvtpd2ps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
-    cvtpd2ps(a)
+    let r = simd_cast::<_, f32x2>(a.as_f64x2());
+    let zero = f32x2::new(0.0, 0.0);
+    transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
 }
 
 /// Converts packed single-precision (32-bit) floating-point elements in `a` to
@@ -2253,7 +2270,8 @@ pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
 #[cfg_attr(test, assert_instr(cvtps2pd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d {
-    cvtps2pd(a)
+    let a = a.as_f32x4();
+    transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
 }
 
 /// Converts packed double-precision (64-bit) floating-point elements in `a` to
@@ -2432,7 +2450,10 @@ pub unsafe fn _mm_setzero_pd() -> __m128d {
 #[cfg_attr(test, assert_instr(movmskpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 {
-    movmskpd(a)
+    // Propagate the highest bit to the rest, because simd_bitmask
+    // requires all-1 or all-0.
+ let mask: i64x2 = simd_lt(transmute(a), i64x2::splat(0)); + simd_bitmask::(mask).into() } /// Loads 128-bits (composed of 2 packed double-precision (64-bit) @@ -2826,18 +2847,8 @@ extern "C" { fn lfence(); #[link_name = "llvm.x86.sse2.mfence"] fn mfence(); - #[link_name = "llvm.x86.sse2.pavg.b"] - fn pavgb(a: u8x16, b: u8x16) -> u8x16; - #[link_name = "llvm.x86.sse2.pavg.w"] - fn pavgw(a: u16x8, b: u16x8) -> u16x8; #[link_name = "llvm.x86.sse2.pmadd.wd"] fn pmaddwd(a: i16x8, b: i16x8) -> i32x4; - #[link_name = "llvm.x86.sse2.pmulh.w"] - fn pmulhw(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.sse2.pmulhu.w"] - fn pmulhuw(a: u16x8, b: u16x8) -> u16x8; - #[link_name = "llvm.x86.sse2.pmulu.dq"] - fn pmuludq(a: u32x4, b: u32x4) -> u64x2; #[link_name = "llvm.x86.sse2.psad.bw"] fn psadbw(a: u8x16, b: u8x16) -> u64x2; #[link_name = "llvm.x86.sse2.psll.w"] @@ -2856,8 +2867,6 @@ extern "C" { fn psrld(a: i32x4, count: i32x4) -> i32x4; #[link_name = "llvm.x86.sse2.psrl.q"] fn psrlq(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.sse2.cvtdq2ps"] - fn cvtdq2ps(a: i32x4) -> __m128; #[link_name = "llvm.x86.sse2.cvtps2dq"] fn cvtps2dq(a: __m128) -> i32x4; #[link_name = "llvm.x86.sse2.maskmov.dqu"] @@ -2908,12 +2917,6 @@ extern "C" { fn ucomigesd(a: __m128d, b: __m128d) -> i32; #[link_name = "llvm.x86.sse2.ucomineq.sd"] fn ucomineqsd(a: __m128d, b: __m128d) -> i32; - #[link_name = "llvm.x86.sse2.movmsk.pd"] - fn movmskpd(a: __m128d) -> i32; - #[link_name = "llvm.x86.sse2.cvtpd2ps"] - fn cvtpd2ps(a: __m128d) -> __m128; - #[link_name = "llvm.x86.sse2.cvtps2pd"] - fn cvtps2pd(a: __m128) -> __m128d; #[link_name = "llvm.x86.sse2.cvtpd2dq"] fn cvtpd2dq(a: __m128d) -> i32x4; #[link_name = "llvm.x86.sse2.cvtsd2si"] @@ -2956,11 +2959,15 @@ mod tests { } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + #[cfg_attr(miri, ignore)] unsafe fn test_mm_lfence() { _mm_lfence(); } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + #[cfg_attr(miri, ignore)] unsafe fn test_mm_mfence() { _mm_mfence(); } @@ -3343,83 +3350,124 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_slli_epi16() { - #[rustfmt::skip] - let a = _mm_setr_epi16( - 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0, - ); + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); let r = _mm_slli_epi16::<4>(a); - - #[rustfmt::skip] - let e = _mm_setr_epi16( - 0xFFF0 as u16 as i16, 0xFFF0 as u16 as i16, 0x0FF0, 0x00F0, - 0, 0, 0, 0, + assert_eq_m128i( + r, + _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0), ); - assert_eq_m128i(r, e); + let r = _mm_slli_epi16::<16>(a); + assert_eq_m128i(r, _mm_set1_epi16(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_sll_epi16() { - let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0); - let r = _mm_sll_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0)); - assert_eq_m128i(r, _mm_setr_epi16(0xFF0, 0, 0, 0, 0, 0, 0, 0)); - let r = _mm_sll_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0)); - assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0)); + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i( + r, + _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0), + ); + let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16)); + 
assert_eq_m128i(r, _mm_set1_epi16(0)); + let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi16(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_slli_epi32() { - let r = _mm_slli_epi32::<4>(_mm_set1_epi32(0xFFFF)); - assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0)); + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_slli_epi32::<4>(a); + assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0)); + let r = _mm_slli_epi32::<32>(a); + assert_eq_m128i(r, _mm_set1_epi32(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_sll_epi32() { - let a = _mm_set1_epi32(0xFFFF); - let b = _mm_setr_epi32(4, 0, 0, 0); - let r = _mm_sll_epi32(a, b); - assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0)); + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0)); + let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m128i(r, _mm_set1_epi32(0)); + let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi32(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_slli_epi64() { - let r = _mm_slli_epi64::<4>(_mm_set1_epi64x(0xFFFFFFFF)); - assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0)); + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_slli_epi64::<4>(a); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0)); + let r = _mm_slli_epi64::<64>(a); + assert_eq_m128i(r, _mm_set1_epi64x(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_sll_epi64() { - let a = _mm_set1_epi64x(0xFFFFFFFF); - let b = _mm_setr_epi64x(4, 0); - let r = _mm_sll_epi64(a, b); - assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0)); + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0)); + let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srai_epi16() { - let r = _mm_srai_epi16::<1>(_mm_set1_epi16(-1)); - assert_eq_m128i(r, _mm_set1_epi16(-1)); + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_srai_epi16::<4>(a); + assert_eq_m128i( + r, + _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10), + ); + let r = _mm_srai_epi16::<16>(a); + assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_sra_epi16() { - let a = _mm_set1_epi16(-1); - let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - let r = _mm_sra_epi16(a, b); - assert_eq_m128i(r, _mm_set1_epi16(-1)); + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i( + r, + _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10), + ); + let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1)); + let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srai_epi32() { - 
let r = _mm_srai_epi32::<1>(_mm_set1_epi32(-1)); - assert_eq_m128i(r, _mm_set1_epi32(-1)); + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_srai_epi32::<4>(a); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000)); + let r = _mm_srai_epi32::<32>(a); + assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_sra_epi32() { - let a = _mm_set1_epi32(-1); - let b = _mm_setr_epi32(1, 0, 0, 0); - let r = _mm_sra_epi32(a, b); - assert_eq_m128i(r, _mm_set1_epi32(-1)); + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000)); + let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1)); + let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1)); } #[simd_test(enable = "sse2")] @@ -3453,53 +3501,74 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_srli_epi16() { - #[rustfmt::skip] - let a = _mm_setr_epi16( - 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0, - ); + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); let r = _mm_srli_epi16::<4>(a); - #[rustfmt::skip] - let e = _mm_setr_epi16( - 0xFFF as u16 as i16, 0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0, + assert_eq_m128i( + r, + _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0), ); - assert_eq_m128i(r, e); + let r = _mm_srli_epi16::<16>(a); + assert_eq_m128i(r, _mm_set1_epi16(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srl_epi16() { - let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0); - let r = _mm_srl_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0)); - assert_eq_m128i(r, _mm_setr_epi16(0xF, 0, 0, 0, 0, 0, 0, 0)); - let r = _mm_srl_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0)); - assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0)); + let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF); + let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i( + r, + _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0), + ); + let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m128i(r, _mm_set1_epi16(0)); + let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi16(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srli_epi32() { - let r = _mm_srli_epi32::<4>(_mm_set1_epi32(0xFFFF)); - assert_eq_m128i(r, _mm_set1_epi32(0xFFF)); + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_srli_epi32::<4>(a); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000)); + let r = _mm_srli_epi32::<32>(a); + assert_eq_m128i(r, _mm_set1_epi32(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srl_epi32() { - let a = _mm_set1_epi32(0xFFFF); - let b = _mm_setr_epi32(4, 0, 0, 0); - let r = _mm_srl_epi32(a, b); - assert_eq_m128i(r, _mm_set1_epi32(0xFFF)); + let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000)); + let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m128i(r, _mm_set1_epi32(0)); + let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 
i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi32(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srli_epi64() { - let r = _mm_srli_epi64::<4>(_mm_set1_epi64x(0xFFFFFFFF)); - assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF)); + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_srli_epi64::<4>(a); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000)); + let r = _mm_srli_epi64::<64>(a); + assert_eq_m128i(r, _mm_set1_epi64x(0)); } #[simd_test(enable = "sse2")] unsafe fn test_mm_srl_epi64() { - let a = _mm_set1_epi64x(0xFFFFFFFF); - let b = _mm_setr_epi64x(4, 0); - let r = _mm_srl_epi64(a, b); - assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF)); + let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4)); + assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000)); + let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0)); + assert_eq_m128i(r, a); + let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); + let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m128i(r, _mm_set1_epi64x(0)); } #[simd_test(enable = "sse2")] @@ -3766,6 +3835,9 @@ mod tests { } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] unsafe fn test_mm_maskmoveu_si128() { let a = _mm_set1_epi8(9); #[rustfmt::skip] @@ -3804,6 +3876,9 @@ mod tests { } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] unsafe fn test_mm_stream_si128() { let a = _mm_setr_epi32(1, 2, 3, 4); let mut r = _mm_undefined_si128(); @@ -3812,6 +3887,9 @@ mod tests { } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] unsafe fn test_mm_stream_si32() { let a: i32 = 7; let mut mem = boxed::Box::::new(-1); @@ -4055,6 +4133,17 @@ mod tests { let b = _mm_setr_pd(5.0, 10.0); let r = _mm_max_pd(a, b); assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0)); + + // Check SSE(2)-specific semantics for -0.0 handling. + let a = _mm_setr_pd(-0.0, 0.0); + let b = _mm_setr_pd(0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_max_pd(a, b)); + let r2: [u8; 16] = transmute(_mm_max_pd(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present } #[simd_test(enable = "sse2")] @@ -4071,6 +4160,17 @@ mod tests { let b = _mm_setr_pd(5.0, 10.0); let r = _mm_min_pd(a, b); assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0)); + + // Check SSE(2)-specific semantics for -0.0 handling. 
+ let a = _mm_setr_pd(-0.0, 0.0); + let b = _mm_setr_pd(0.0, 0.0); + let r1: [u8; 16] = transmute(_mm_min_pd(a, b)); + let r2: [u8; 16] = transmute(_mm_min_pd(b, a)); + let a: [u8; 16] = transmute(a); + let b: [u8; 16] = transmute(b); + assert_eq!(r1, b); + assert_eq!(r2, a); + assert_ne!(a, b); // sanity check that -0.0 is actually present } #[simd_test(enable = "sse2")] @@ -4158,7 +4258,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpeq_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b)); assert_eq_m128i(r, e); } @@ -4166,7 +4266,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmplt_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b)); assert_eq_m128i(r, e); } @@ -4174,7 +4274,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmple_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b)); assert_eq_m128i(r, e); } @@ -4182,7 +4282,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpgt_sd() { let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b)); assert_eq_m128i(r, e); } @@ -4190,7 +4290,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpge_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b)); assert_eq_m128i(r, e); } @@ -4198,7 +4298,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpord_sd() { let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0)); - let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b)); assert_eq_m128i(r, e); } @@ -4206,7 +4306,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpunord_sd() { let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b)); assert_eq_m128i(r, e); } @@ -4214,7 +4314,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpneq_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); - let e = _mm_setr_epi64x(!0, transmute(2.0f64)); + let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b)); assert_eq_m128i(r, e); } @@ -4222,7 +4322,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpnlt_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0)); - let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b)); assert_eq_m128i(r, e); } @@ -4230,7 +4330,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpnle_sd() { let (a, b) = 
(_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b)); assert_eq_m128i(r, e); } @@ -4238,7 +4338,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpngt_sd() { let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b)); assert_eq_m128i(r, e); } @@ -4246,7 +4346,7 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_cmpnge_sd() { let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0)); - let e = _mm_setr_epi64x(0, transmute(2.0f64)); + let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64); let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b)); assert_eq_m128i(r, e); } @@ -4478,6 +4578,9 @@ mod tests { } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] unsafe fn test_mm_stream_pd() { #[repr(align(128))] struct Memory { diff --git a/library/stdarch/crates/core_arch/src/x86/sse3.rs b/library/stdarch/crates/core_arch/src/x86/sse3.rs index 092a8d9cd..df0d78e5b 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse3.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse3.rs @@ -1,7 +1,7 @@ //! Streaming SIMD Extensions 3 (SSE3) use crate::{ - core_arch::{simd::*, simd_llvm::simd_shuffle, x86::*}, + core_arch::{simd::*, simd_llvm::*, x86::*}, mem::transmute, }; @@ -17,7 +17,11 @@ use stdarch_test::assert_instr; #[cfg_attr(test, assert_instr(addsubps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { - addsubps(a, b) + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [4, 1, 6, 3]) } /// Alternatively add and subtract packed double-precision (64-bit) @@ -29,7 +33,11 @@ pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { #[cfg_attr(test, assert_instr(addsubpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { - addsubpd(a, b) + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [2, 1]) } /// Horizontally adds adjacent pairs of double-precision (64-bit) @@ -143,10 +151,6 @@ pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 { #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.sse3.addsub.ps"] - fn addsubps(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse3.addsub.pd"] - fn addsubpd(a: __m128d, b: __m128d) -> __m128d; #[link_name = "llvm.x86.sse3.hadd.pd"] fn haddpd(a: __m128d, b: __m128d) -> __m128d; #[link_name = "llvm.x86.sse3.hadd.ps"] diff --git a/library/stdarch/crates/core_arch/src/x86/sse41.rs b/library/stdarch/crates/core_arch/src/x86/sse41.rs index 7ba86e5f7..6d33238b0 100644 --- a/library/stdarch/crates/core_arch/src/x86/sse41.rs +++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs @@ -62,7 +62,8 @@ pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTI #[cfg_attr(test, assert_instr(pblendvb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i { - transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16())) + let 
mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::splat(0));
+    transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
 }
 
 /// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
@@ -74,15 +75,25 @@ pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
 #[inline]
 #[target_feature(enable = "sse4.1")]
-// Note: LLVM7 prefers the single-precision floating-point domain when possible
-// see https://bugs.llvm.org/show_bug.cgi?id=38195
-// #[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xF0))]
-#[cfg_attr(test, assert_instr(blendps, IMM8 = 0xF0))]
+#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
 #[rustc_legacy_const_generics(2)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
     static_assert_uimm_bits!(IMM8, 8);
-    transmute(pblendw(a.as_i16x8(), b.as_i16x8(), IMM8 as u8))
+    transmute::<i16x8, _>(simd_shuffle!(
+        a.as_i16x8(),
+        b.as_i16x8(),
+        [
+            [0, 8][IMM8 as usize & 1],
+            [1, 9][(IMM8 >> 1) as usize & 1],
+            [2, 10][(IMM8 >> 2) as usize & 1],
+            [3, 11][(IMM8 >> 3) as usize & 1],
+            [4, 12][(IMM8 >> 4) as usize & 1],
+            [5, 13][(IMM8 >> 5) as usize & 1],
+            [6, 14][(IMM8 >> 6) as usize & 1],
+            [7, 15][(IMM8 >> 7) as usize & 1],
+        ]
+    ))
 }
 
 /// Blend packed double-precision (64-bit) floating-point elements from `a`
@@ -94,7 +105,8 @@ pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i) -> __m128
 #[cfg_attr(test, assert_instr(blendvpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
-    blendvpd(a, b, mask)
+    let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::splat(0));
+    transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
 }
 
 /// Blend packed single-precision (32-bit) floating-point elements from `a`
@@ -106,7 +118,8 @@ pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(blendvps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
-    blendvps(a, b, mask)
+    let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::splat(0));
+    transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
 }
 
 /// Blend packed double-precision (64-bit) floating-point elements from `a`
@@ -123,7 +136,11 @@ pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
     static_assert_uimm_bits!(IMM2, 2);
-    blendpd(a, b, IMM2 as u8)
+    transmute::<f64x2, _>(simd_shuffle!(
+        a.as_f64x2(),
+        b.as_f64x2(),
+        [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
+    ))
 }
 
 /// Blend packed single-precision (32-bit) floating-point elements from `a`
@@ -137,7 +154,16 @@ pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d) -> __m128d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
     static_assert_uimm_bits!(IMM4, 4);
-    blendps(a, b, IMM4 as u8)
+    transmute::<f32x4, _>(simd_shuffle!(
+        a.as_f32x4(),
+        b.as_f32x4(),
+        [
+            [0, 4][IMM4 as usize & 1],
+            [1, 5][(IMM4 >> 1) as usize & 1],
+            [2, 6][(IMM4 >> 2) as usize & 1],
+            [3, 7][(IMM4 >> 3) as usize & 1],
+        ]
+    ))
 }
 
 /// Extracts a single-precision (32-bit) floating-point element from `a`,
@@ -175,7 +201,7 @@ pub unsafe fn _mm_blend_ps(a: __m128, b: __m128) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_extract_ps(a: __m128) -> i32 { static_assert_uimm_bits!(IMM8, 2); - transmute(simd_extract::<_, f32>(a, IMM8 as u32)) + simd_extract::<_, f32>(a, IMM8 as u32).to_bits() as i32 } /// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit @@ -923,7 +949,9 @@ pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmuldq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { - transmute(pmuldq(a.as_i32x4(), b.as_i32x4())) + let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2())); + let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2())); + transmute(simd_mul(a, b)) } /// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate @@ -1124,18 +1152,6 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.sse41.pblendvb"] - fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16; - #[link_name = "llvm.x86.sse41.blendvpd"] - fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d; - #[link_name = "llvm.x86.sse41.blendvps"] - fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128; - #[link_name = "llvm.x86.sse41.blendpd"] - fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d; - #[link_name = "llvm.x86.sse41.blendps"] - fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128; - #[link_name = "llvm.x86.sse41.pblendw"] - fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8; #[link_name = "llvm.x86.sse41.insertps"] fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128; #[link_name = "llvm.x86.sse41.packusdw"] @@ -1154,8 +1170,6 @@ extern "C" { fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128; #[link_name = "llvm.x86.sse41.phminposuw"] fn phminposuw(a: u16x8) -> u16x8; - #[link_name = "llvm.x86.sse41.pmuldq"] - fn pmuldq(a: i32x4, b: i32x4) -> i64x2; #[link_name = "llvm.x86.sse41.mpsadbw"] fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8; #[link_name = "llvm.x86.sse41.ptestz"] @@ -1245,9 +1259,9 @@ mod tests { #[simd_test(enable = "sse4.1")] unsafe fn test_mm_extract_ps() { let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0); - let r: f32 = transmute(_mm_extract_ps::<1>(a)); + let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32); assert_eq!(r, 1.0); - let r: f32 = transmute(_mm_extract_ps::<3>(a)); + let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32); assert_eq!(r, 3.0); } @@ -1668,6 +1682,7 @@ mod tests { assert_eq_m128(r, e); } + #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions #[simd_test(enable = "sse4.1")] unsafe fn test_mm_round_sd() { let a = _mm_setr_pd(1.5, 3.5); @@ -1680,6 +1695,7 @@ mod tests { assert_eq_m128d(r, e); } + #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions #[simd_test(enable = "sse4.1")] unsafe fn test_mm_round_ss() { let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); diff --git a/library/stdarch/crates/core_arch/src/x86/test.rs b/library/stdarch/crates/core_arch/src/x86/test.rs index ec4298033..50b2d93be 100644 --- a/library/stdarch/crates/core_arch/src/x86/test.rs +++ b/library/stdarch/crates/core_arch/src/x86/test.rs @@ -3,11 +3,13 @@ use crate::core_arch::x86::*; use std::mem::transmute; +#[track_caller] #[target_feature(enable = "sse2")] pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) { assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b)) } +#[track_caller] #[target_feature(enable = 
"sse2")] pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) { if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 { @@ -20,6 +22,7 @@ pub unsafe fn get_m128d(a: __m128d, idx: usize) -> f64 { transmute::<_, [f64; 2]>(a)[idx] } +#[track_caller] #[target_feature(enable = "sse")] pub unsafe fn assert_eq_m128(a: __m128, b: __m128) { let r = _mm_cmpeq_ps(a, b); @@ -40,11 +43,13 @@ pub unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i { _mm_set_epi64x(b, a) } +#[track_caller] #[target_feature(enable = "avx")] pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) { assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b)) } +#[track_caller] #[target_feature(enable = "avx")] pub unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) { let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b); @@ -58,6 +63,7 @@ pub unsafe fn get_m256d(a: __m256d, idx: usize) -> f64 { transmute::<_, [f64; 4]>(a)[idx] } +#[track_caller] #[target_feature(enable = "avx")] pub unsafe fn assert_eq_m256(a: __m256, b: __m256) { let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b); @@ -125,10 +131,12 @@ mod x86_polyfill { } pub use self::x86_polyfill::*; +#[track_caller] pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { assert_eq!(transmute::<_, [i32; 16]>(a), transmute::<_, [i32; 16]>(b)) } +#[track_caller] pub unsafe fn assert_eq_m512(a: __m512, b: __m512) { let cmp = _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b); if cmp != 0b11111111_11111111 { @@ -136,6 +144,7 @@ pub unsafe fn assert_eq_m512(a: __m512, b: __m512) { } } +#[track_caller] pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) { let cmp = _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b); if cmp != 0b11111111 { diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs index 68f332767..bace11d13 100644 --- a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs @@ -33,7 +33,7 @@ pub unsafe fn _mm_cvtss_i64(a: __m128) -> i64 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2usi))] pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 { - transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst. @@ -43,7 +43,7 @@ pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2usi))] pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 { - transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. @@ -54,8 +54,7 @@ pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 { #[cfg_attr(test, assert_instr(vcvtsi2ss))] pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 { let b = b as f32; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
@@ -66,8 +65,7 @@ pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 { #[cfg_attr(test, assert_instr(vcvtsi2sd))] pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d { let b = b as f64; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. @@ -78,8 +76,7 @@ pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d { #[cfg_attr(test, assert_instr(vcvtusi2ss))] pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 { let b = b as f32; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. @@ -90,8 +87,7 @@ pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 { #[cfg_attr(test, assert_instr(vcvtusi2sd))] pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d { let b = b as f64; - let r = simd_insert(a, 0, b); - transmute(r) + simd_insert(a, 0, b) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst. @@ -101,7 +97,7 @@ pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2si))] pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 { - transmute(vcvtsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst. @@ -111,7 +107,7 @@ pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2usi))] pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 { - transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) + vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst. @@ -121,7 +117,7 @@ pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2si))] pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 { - transmute(vcvtss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst. @@ -131,7 +127,7 @@ pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2usi))] pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 { - transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) + vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } /// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
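The `#[track_caller]` attributes added to the `assert_eq_m128*` helpers in `core_arch/src/x86/test.rs` above make a failing comparison report the line in the test that called the helper rather than the `assert_eq!` inside the helper. A small standalone illustration of the mechanism; the function is hypothetical:

```rust
// With #[track_caller], the panic location for a failed assertion is the
// caller of this helper, which is what you want in a test suite.
#[track_caller]
fn assert_eq_u32(a: u32, b: u32) {
    assert_eq!(a, b);
}

fn main() {
    assert_eq_u32(2 + 2, 4); // a failure here would be reported at this line
}
```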
@@ -270,8 +266,7 @@ pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64) -> __m pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - let r = vcvtsd2si64(a, ROUNDING); - transmute(r) + vcvtsd2si64(a, ROUNDING) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ @@ -290,8 +285,7 @@ pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - let r = vcvtsd2si64(a, ROUNDING); - transmute(r) + vcvtsd2si64(a, ROUNDING) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\ @@ -310,8 +304,7 @@ pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - let r = vcvtsd2usi64(a, ROUNDING); - transmute(r) + vcvtsd2usi64(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ @@ -330,8 +323,7 @@ pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { pub unsafe fn _mm_cvt_roundss_si64(a: __m128) -> i64 { static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - let r = vcvtss2si64(a, ROUNDING); - transmute(r) + vcvtss2si64(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ @@ -350,8 +342,7 @@ pub unsafe fn _mm_cvt_roundss_si64(a: __m128) -> i64 { pub unsafe fn _mm_cvt_roundss_i64(a: __m128) -> i64 { static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - let r = vcvtss2si64(a, ROUNDING); - transmute(r) + vcvtss2si64(a, ROUNDING) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\ @@ -370,8 +361,7 @@ pub unsafe fn _mm_cvt_roundss_i64(a: __m128) -> i64 { pub unsafe fn _mm_cvt_roundss_u64(a: __m128) -> u64 { static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - let r = vcvtss2usi64(a, ROUNDING); - transmute(r) + vcvtss2usi64(a, ROUNDING) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ @@ -385,8 +375,7 @@ pub unsafe fn _mm_cvt_roundss_u64(a: __m128) -> u64 { pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { static_assert_sae!(SAE); let a = a.as_f64x2(); - let r = vcvtsd2si64(a, SAE); - transmute(r) + vcvtsd2si64(a, SAE) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ @@ -400,8 +389,7 @@ pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { static_assert_sae!(SAE); let a = a.as_f64x2(); - let r = vcvtsd2si64(a, SAE); - transmute(r) + vcvtsd2si64(a, SAE) } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\ @@ -415,8 +403,7 @@ pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { static_assert_sae!(SAE); let a = a.as_f64x2(); - let r = vcvtsd2usi64(a, SAE); - transmute(r) + vcvtsd2usi64(a, SAE) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with 
truncation, and store the result in dst.\ @@ -430,8 +417,7 @@ pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { pub unsafe fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { static_assert_sae!(SAE); let a = a.as_f32x4(); - let r = vcvtss2si64(a, SAE); - transmute(r) + vcvtss2si64(a, SAE) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ @@ -445,8 +431,7 @@ pub unsafe fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { pub unsafe fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { static_assert_sae!(SAE); let a = a.as_f32x4(); - let r = vcvtss2si64(a, SAE); - transmute(r) + vcvtss2si64(a, SAE) } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\ @@ -460,8 +445,7 @@ pub unsafe fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { pub unsafe fn _mm_cvtt_roundss_u64(a: __m128) -> u64 { static_assert_sae!(SAE); let a = a.as_f32x4(); - let r = vcvtss2usi64(a, SAE); - transmute(r) + vcvtss2usi64(a, SAE) } #[allow(improper_ctypes)] diff --git a/library/stdarch/crates/core_arch/src/x86_64/sse2.rs b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs index bf2394eba..9619cb748 100644 --- a/library/stdarch/crates/core_arch/src/x86_64/sse2.rs +++ b/library/stdarch/crates/core_arch/src/x86_64/sse2.rs @@ -181,6 +181,9 @@ mod tests { } #[simd_test(enable = "sse2")] + // Miri cannot support this until it is clear how it fits in the Rust memory model + // (non-temporal store) + #[cfg_attr(miri, ignore)] unsafe fn test_mm_stream_si64() { let a: i64 = 7; let mut mem = boxed::Box::::new(-1); diff --git a/library/stdarch/crates/intrinsic-test/Cargo.toml b/library/stdarch/crates/intrinsic-test/Cargo.toml index d977dd659..c7a18f77f 100644 --- a/library/stdarch/crates/intrinsic-test/Cargo.toml +++ b/library/stdarch/crates/intrinsic-test/Cargo.toml @@ -12,10 +12,10 @@ lazy_static = "1.4.0" serde = { version = "1", features = ["derive"] } serde_json = "1.0" csv = "1.1" -clap = "2.33.3" +clap = { version = "4.4", features = ["derive"] } regex = "1.4.2" log = "0.4.11" -pretty_env_logger = "0.4.0" +pretty_env_logger = "0.5.0" rayon = "1.5.0" diff = "0.1.12" -itertools = "0.10.1" +itertools = "0.11.0" diff --git a/library/stdarch/crates/intrinsic-test/README.md b/library/stdarch/crates/intrinsic-test/README.md index 8a8ddab40..2b3f0c75a 100644 --- a/library/stdarch/crates/intrinsic-test/README.md +++ b/library/stdarch/crates/intrinsic-test/README.md @@ -4,15 +4,17 @@ each produces the same result from random inputs. 

 # Usage
 ```
 USAGE:
-    intrinsic-test [OPTIONS]
+    intrinsic-test [FLAGS] [OPTIONS]

 FLAGS:
+        --a32            Run tests for A32 instrinsics instead of A64
     -h, --help           Prints help information
     -V, --version        Prints version information

 OPTIONS:
         --cppcompiler    The C++ compiler to use for compiling the c++ code [default: clang++]
         --runner         Run the C programs under emulation with this command
+        --skip           Filename for a list of intrinsics to skip (one per line)
         --toolchain      The rust toolchain to use for building the rust code

 ARGS:
diff --git a/library/stdarch/crates/intrinsic-test/src/json_parser.rs b/library/stdarch/crates/intrinsic-test/src/json_parser.rs
index bc6fa4a9e..8b3c7869c 100644
--- a/library/stdarch/crates/intrinsic-test/src/json_parser.rs
+++ b/library/stdarch/crates/intrinsic-test/src/json_parser.rs
@@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::path::Path;

 use serde::Deserialize;

@@ -41,7 +42,7 @@ struct JsonIntrinsic {
     architectures: Vec,
 }

-pub fn get_neon_intrinsics(filename: &str) -> Result, Box> {
+pub fn get_neon_intrinsics(filename: &Path) -> Result, Box> {
     let file = std::fs::File::open(filename)?;
     let reader = std::io::BufReader::new(file);
     let json: Vec = serde_json::from_reader(reader).expect("Couldn't parse JSON");
diff --git a/library/stdarch/crates/intrinsic-test/src/main.rs b/library/stdarch/crates/intrinsic-test/src/main.rs
index 76d2da3ab..15bc021c7 100644
--- a/library/stdarch/crates/intrinsic-test/src/main.rs
+++ b/library/stdarch/crates/intrinsic-test/src/main.rs
@@ -4,9 +4,9 @@ extern crate log;

 use std::fs::File;
 use std::io::Write;
+use std::path::PathBuf;
 use std::process::Command;

-use clap::{App, Arg};
 use intrinsic::Intrinsic;
 use itertools::Itertools;
 use rayon::prelude::*;
@@ -320,58 +320,47 @@ path = "{intrinsic}/main.rs""#,
     }
 }

+/// Intrinsic test tool
+#[derive(clap::Parser)]
+#[command(
+    name = "Intrinsic test tool",
+    about = "Generates Rust and C programs for intrinsics and compares the output"
+)]
+struct Cli {
+    /// The input file containing the intrinsics
+    input: PathBuf,
+
+    /// The rust toolchain to use for building the rust code
+    #[arg(long)]
+    toolchain: Option,
+
+    /// The C++ compiler to use for compiling the c++ code
+    #[arg(long, default_value_t = String::from("clang++"))]
+    cppcompiler: String,
+
+    /// Run the C programs under emulation with this command
+    #[arg(long)]
+    runner: Option,
+
+    /// Filename for a list of intrinsics to skip (one per line)
+    #[arg(long)]
+    skip: Option,
+
+    /// Run tests for A32 instrinsics instead of A64
+    #[arg(long)]
+    a32: bool,
+}
+
 fn main() {
     pretty_env_logger::init();

-    let matches = App::new("Intrinsic test tool")
-        .about("Generates Rust and C programs for intrinsics and compares the output")
-        .arg(
-            Arg::with_name("INPUT")
-                .help("The input file containing the intrinsics")
-                .required(true)
-                .index(1),
-        )
-        .arg(
-            Arg::with_name("TOOLCHAIN")
-                .takes_value(true)
-                .long("toolchain")
-                .help("The rust toolchain to use for building the rust code"),
-        )
-        .arg(
-            Arg::with_name("CPPCOMPILER")
-                .takes_value(true)
-                .default_value("clang++")
-                .long("cppcompiler")
-                .help("The C++ compiler to use for compiling the c++ code"),
-        )
-        .arg(
-            Arg::with_name("RUNNER")
-                .takes_value(true)
-                .long("runner")
-                .help("Run the C programs under emulation with this command"),
-        )
-        .arg(
-            Arg::with_name("SKIP")
-                .takes_value(true)
-                .long("skip")
-                .help("Filename for a list of intrinsics to skip (one per line)"),
-        )
-        .arg(
-            Arg::with_name("A32")
-                .takes_value(false)
-                .long("a32")
-                .help("Run tests for A32 instrinsics instead of A64"),
-        )
-        .get_matches();
-
-    let filename = matches.value_of("INPUT").unwrap();
-    let toolchain = matches
-        .value_of("TOOLCHAIN")
-        .map_or("".into(), |t| format!("+{t}"));
+    let args: Cli = clap::Parser::parse();

-    let cpp_compiler = matches.value_of("CPPCOMPILER").unwrap();
-    let c_runner = matches.value_of("RUNNER").unwrap_or("");
-    let skip = if let Some(filename) = matches.value_of("SKIP") {
+    let filename = args.input;
+    let toolchain = args.toolchain.map_or_else(String::new, |t| format!("+{t}"));
+    let cpp_compiler = args.cppcompiler;
+    let c_runner = args.runner.unwrap_or_else(String::new);
+    let skip = if let Some(filename) = args.skip {
         let data = std::fs::read_to_string(&filename).expect("Failed to open file");
         data.lines()
             .map(str::trim)
@@ -381,8 +370,8 @@ fn main() {
     } else {
         Default::default()
     };
-    let a32 = matches.is_present("A32");
-    let mut intrinsics = get_neon_intrinsics(filename).expect("Error parsing input file");
+    let a32 = args.a32;
+    let mut intrinsics = get_neon_intrinsics(&filename).expect("Error parsing input file");

     intrinsics.sort_by(|a, b| a.name.cmp(&b.name));

@@ -409,7 +398,7 @@ fn main() {

     let notices = build_notices("// ");

-    if !build_c(&notices, &intrinsics, cpp_compiler, a32) {
+    if !build_c(&notices, &intrinsics, &cpp_compiler, a32) {
         std::process::exit(2);
     }

diff --git a/library/stdarch/crates/simd-test-macro/Cargo.toml b/library/stdarch/crates/simd-test-macro/Cargo.toml
index cd110c1d3..c9e692d8e 100644
--- a/library/stdarch/crates/simd-test-macro/Cargo.toml
+++ b/library/stdarch/crates/simd-test-macro/Cargo.toml
@@ -11,3 +11,4 @@ test = false
 [dependencies]
 proc-macro2 = "1.0"
 quote = "1.0"
+syn = { version = "2.0", features = ["full"] }
diff --git a/library/stdarch/crates/simd-test-macro/src/lib.rs b/library/stdarch/crates/simd-test-macro/src/lib.rs
index 2a31dd745..9e089f86b 100644
--- a/library/stdarch/crates/simd-test-macro/src/lib.rs
+++ b/library/stdarch/crates/simd-test-macro/src/lib.rs
@@ -7,7 +7,7 @@
 #[macro_use]
 extern crate quote;

-use proc_macro2::{Delimiter, Ident, Literal, Span, TokenStream, TokenTree};
+use proc_macro2::{Ident, Literal, Span, TokenStream, TokenTree};
 use quote::ToTokens;
 use std::env;

@@ -44,13 +44,9 @@ pub fn simd_test(
         .collect();
     let enable_feature = string(enable_feature);

-    let item = TokenStream::from(item);
-    let name = find_name(item.clone());
-
-    let name: TokenStream = name
-        .to_string()
-        .parse()
-        .unwrap_or_else(|_| panic!("failed to parse name: {}", name.to_string()));
+    let mut item = syn::parse_macro_input!(item as syn::ItemFn);
+    let item_attrs = std::mem::take(&mut item.attrs);
+    let name = &item.sig.ident;

     let target = env::var("TARGET").expect(
         "TARGET environment variable should be set for rustc (e.g. TARGET=x86_64-apple-darwin cargo test)"
@@ -109,6 +105,7 @@ pub fn simd_test(
         #[allow(non_snake_case)]
         #[test]
         #maybe_ignore
+        #(#item_attrs)*
         fn #name() {
             if #force_test | (#cfg_target_features) {
                 let v = unsafe { #name() };
@@ -123,29 +120,3 @@ pub fn simd_test(
     };
     ret.into()
 }
-
-fn find_name(item: TokenStream) -> Ident {
-    let mut tokens = item.into_iter();
-    while let Some(tok) = tokens.next() {
-        if let TokenTree::Ident(word) = tok {
-            if word == "fn" {
-                break;
-            }
-        }
-    }
-
-    fn get_ident(tt: TokenTree) -> Option {
-        match tt {
-            TokenTree::Ident(i) => Some(i),
-            TokenTree::Group(g) if g.delimiter() == Delimiter::None => {
-                get_ident(g.stream().into_iter().next()?)
-            }
-            _ => None,
-        }
-    }
-
-    tokens
-        .next()
-        .and_then(get_ident)
-        .expect("failed to find function name")
-}
diff --git a/library/stdarch/crates/std_detect/Cargo.toml b/library/stdarch/crates/std_detect/Cargo.toml
index 589a3900a..12d4a658c 100644
--- a/library/stdarch/crates/std_detect/Cargo.toml
+++ b/library/stdarch/crates/std_detect/Cargo.toml
@@ -30,7 +30,6 @@ compiler_builtins = { version = "0.1.2", optional = true }
 alloc = { version = "1.0.0", optional = true, package = "rustc-std-workspace-alloc" }

 [dev-dependencies]
-auxv = "0.3.3"
 cupid = "0.6.0"

 [features]
diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
index 8bc0b30c3..ee46aa1ac 100644
--- a/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
+++ b/library/stdarch/crates/std_detect/src/detect/os/linux/auxvec.rs
@@ -19,6 +19,7 @@ pub(crate) const AT_HWCAP2: usize = 26;
 /// If an entry cannot be read all the bits in the bitfield are set to zero.
 /// This should be interpreted as all the features being disabled.
 #[derive(Debug, Copy, Clone)]
+#[cfg_attr(test, derive(PartialEq))]
 pub(crate) struct AuxVec {
     pub hwcap: usize,
     #[cfg(any(
@@ -174,9 +175,12 @@ pub(crate) fn auxv() -> Result {
 /// Tries to read the `key` from the auxiliary vector by calling the
 /// dynamically-linked `getauxval` function. If the function is not linked,
 /// this function return `Err`.
-#[cfg(all(
-    feature = "std_detect_dlsym_getauxval",
-    not(all(target_os = "linux", target_env = "gnu"))
+#[cfg(any(
+    test,
+    all(
+        feature = "std_detect_dlsym_getauxval",
+        not(all(target_os = "linux", target_env = "gnu"))
+    )
 ))]
 fn getauxval(key: usize) -> Result {
     use libc;
@@ -262,35 +266,8 @@ fn auxv_from_buf(buf: &[usize]) -> Result {

 #[cfg(test)]
 mod tests {
-    extern crate auxv as auxv_crate;
     use super::*;

-    // Reads the Auxiliary Vector key from /proc/self/auxv
-    // using the auxv crate.
-    #[cfg(feature = "std_detect_file_io")]
-    fn auxv_crate_getprocfs(key: usize) -> Option {
-        use self::auxv_crate::procfs::search_procfs_auxv;
-        use self::auxv_crate::AuxvType;
-        let k = key as AuxvType;
-        match search_procfs_auxv(&[k]) {
-            Ok(v) => Some(v[&k] as usize),
-            Err(_) => None,
-        }
-    }
-
-    // Reads the Auxiliary Vector key from getauxval()
-    // using the auxv crate.
-    #[cfg(not(any(target_arch = "mips", target_arch = "mips64")))]
-    fn auxv_crate_getauxval(key: usize) -> Option {
-        use self::auxv_crate::getauxval::Getauxval;
-        use self::auxv_crate::AuxvType;
-        let q = auxv_crate::getauxval::NativeGetauxval {};
-        match q.getauxval(key as AuxvType) {
-            Ok(v) => Some(v as usize),
-            Err(_) => None,
-        }
-    }
-
     // FIXME: on mips/mips64 getauxval returns 0, and /proc/self/auxv
     // does not always contain the AT_HWCAP key under qemu.
     #[cfg(any(
@@ -301,7 +278,7 @@ mod tests {
     #[test]
     fn auxv_crate() {
         let v = auxv();
-        if let Some(hwcap) = auxv_crate_getauxval(AT_HWCAP) {
+        if let Ok(hwcap) = getauxval(AT_HWCAP) {
             let rt_hwcap = v.expect("failed to find hwcap key").hwcap;
             assert_eq!(rt_hwcap, hwcap);
         }
@@ -314,7 +291,7 @@ mod tests {
             target_arch = "powerpc64"
         ))]
         {
-            if let Some(hwcap2) = auxv_crate_getauxval(AT_HWCAP2) {
+            if let Ok(hwcap2) = getauxval(AT_HWCAP2) {
                 let rt_hwcap2 = v.expect("failed to find hwcap2 key").hwcap2;
                 assert_eq!(rt_hwcap2, hwcap2);
             }
@@ -391,22 +368,8 @@ mod tests {
     #[test]
     #[cfg(feature = "std_detect_file_io")]
     fn auxv_crate_procfs() {
-        let v = auxv();
-        if let Some(hwcap) = auxv_crate_getprocfs(AT_HWCAP) {
-            assert_eq!(v.unwrap().hwcap, hwcap);
-        }
-
-        // Targets with AT_HWCAP and AT_HWCAP2:
-        #[cfg(any(
-            target_arch = "aarch64",
-            target_arch = "arm",
-            target_arch = "powerpc",
-            target_arch = "powerpc64"
-        ))]
-        {
-            if let Some(hwcap2) = auxv_crate_getprocfs(AT_HWCAP2) {
-                assert_eq!(v.unwrap().hwcap2, hwcap2);
-            }
+        if let Ok(procfs_auxv) = auxv_from_file("/proc/self/auxv") {
+            assert_eq!(auxv().unwrap(), procfs_auxv);
         }
     }
 }
diff --git a/library/stdarch/crates/std_detect/src/detect/os/x86.rs b/library/stdarch/crates/std_detect/src/detect/os/x86.rs
index d8afc1aca..d8dd84db4 100644
--- a/library/stdarch/crates/std_detect/src/detect/os/x86.rs
+++ b/library/stdarch/crates/std_detect/src/detect/os/x86.rs
@@ -49,11 +49,7 @@ pub(crate) fn detect_features() -> cache::Initializer {
             ecx,
             edx,
         } = __cpuid(0);
-        let vendor_id: [[u8; 4]; 3] = [
-            mem::transmute(ebx),
-            mem::transmute(edx),
-            mem::transmute(ecx),
-        ];
+        let vendor_id: [[u8; 4]; 3] = [ebx.to_ne_bytes(), edx.to_ne_bytes(), ecx.to_ne_bytes()];
         let vendor_id: [u8; 12] = mem::transmute(vendor_id);
         (max_basic_leaf, vendor_id)
     };
diff --git a/library/stdarch/crates/stdarch-test/Cargo.toml b/library/stdarch/crates/stdarch-test/Cargo.toml
index 3a2130d4e..3682fcd7e 100644
--- a/library/stdarch/crates/stdarch-test/Cargo.toml
+++ b/library/stdarch/crates/stdarch-test/Cargo.toml
@@ -20,7 +20,7 @@ cc = "1.0"
 # time, and we want to make updates to this explicit rather than automatically
 # picking up updates which might break CI with new instruction names.
 [target.'cfg(target_arch = "wasm32")'.dependencies]
-wasmprinter = "=0.2.53"
+wasmprinter = "=0.2.67"

 [features]
 default = []
diff --git a/library/stdarch/crates/stdarch-test/src/disassembly.rs b/library/stdarch/crates/stdarch-test/src/disassembly.rs
index 54df7261e..087fc46d4 100644
--- a/library/stdarch/crates/stdarch-test/src/disassembly.rs
+++ b/library/stdarch/crates/stdarch-test/src/disassembly.rs
@@ -81,6 +81,8 @@ pub(crate) fn disassemble_myself() -> HashSet {
     let add_args = if cfg!(target_os = "macos") && cfg!(target_arch = "aarch64") {
         // Target features need to be enabled for LLVM objdump on Macos ARM64
         vec!["--mattr=+v8.6a,+crypto,+tme"]
+    } else if cfg!(target_arch = "riscv64") {
+        vec!["--mattr=+zk,+zks,+zbc,+zbb"]
     } else {
         vec![]
     };
diff --git a/library/stdarch/crates/stdarch-verify/Cargo.toml b/library/stdarch/crates/stdarch-verify/Cargo.toml
index 10ae90074..515f05138 100644
--- a/library/stdarch/crates/stdarch-verify/Cargo.toml
+++ b/library/stdarch/crates/stdarch-verify/Cargo.toml
@@ -7,7 +7,7 @@ edition = "2021"
 [dependencies]
 proc-macro2 = "1.0"
 quote = "1.0"
-syn = { version = "1.0", features = ["full"] }
+syn = { version = "2.0", features = ["full"] }

 [lib]
 proc-macro = true
@@ -15,5 +15,5 @@ test = false

 [dev-dependencies]
 serde = { version = "1.0", features = ['derive'] }
-serde-xml-rs = "0.3"
+serde-xml-rs = "0.6"
 serde_json = "1.0.96"
diff --git a/library/stdarch/crates/stdarch-verify/src/lib.rs b/library/stdarch/crates/stdarch-verify/src/lib.rs
index a9bf89f70..3f9eb3bf9 100644
--- a/library/stdarch/crates/stdarch-verify/src/lib.rs
+++ b/library/stdarch/crates/stdarch-verify/src/lib.rs
@@ -7,6 +7,7 @@ extern crate syn;
 use proc_macro::TokenStream;
 use std::{fs::File, io::Read, path::Path};
 use syn::ext::IdentExt;
+use syn::parse::Parser as _;

 #[proc_macro]
 pub fn x86_functions(input: TokenStream) -> TokenStream {
@@ -416,7 +417,7 @@ fn walk(root: &Path, files: &mut Vec<(syn::File, String)>) {

 fn find_instrs(attrs: &[syn::Attribute]) -> Vec {
     struct AssertInstr {
-        instr: String,
+        instr: Option,
     }

     // A small custom parser to parse out the instruction in `assert_instr`.
     // TODO: should probably just reuse `Invoc` from the `assert-instr-macro`
     // crate.
     impl syn::parse::Parse for AssertInstr {
-        fn parse(content: syn::parse::ParseStream<'_>) -> syn::Result {
-            let input;
-            parenthesized!(input in content);
-            let _ = input.parse::()?;
-            let _ = input.parse::()?;
-            let ident = input.parse::()?;
-            if ident != "assert_instr" {
-                return Err(input.error("expected `assert_instr`"));
+        fn parse(input: syn::parse::ParseStream<'_>) -> syn::Result {
+            let _ = input.parse::().unwrap();
+            let _ = input.parse::().unwrap();
+
+            match input.parse::() {
+                Ok(ident) if ident == "assert_instr" => {}
+                _ => {
+                    while !input.is_empty() {
+                        // consume everything
+                        drop(input.parse::());
+                    }
+                    return Ok(Self { instr: None });
+                }
             }
+
             let instrs;
             parenthesized!(instrs in input);
@@ -452,18 +459,24 @@ fn find_instrs(attrs: &[syn::Attribute]) -> Vec {
                     return Err(input.error("failed to parse instruction"));
                 }
             }
-            Ok(Self { instr })
+            Ok(Self { instr: Some(instr) })
         }
     }

     attrs
         .iter()
-        .filter(|a| a.path.is_ident("cfg_attr"))
         .filter_map(|a| {
-            syn::parse2::(a.tokens.clone())
-                .ok()
-                .map(|a| a.instr)
+            if let syn::Meta::List(ref l) = a.meta {
+                if l.path.is_ident("cfg_attr") {
+                    Some(l)
+                } else {
+                    None
+                }
+            } else {
+                None
+            }
         })
+        .filter_map(|l| syn::parse2::(l.tokens.clone()).unwrap().instr)
         .collect()
 }

@@ -471,19 +484,26 @@ fn find_target_feature(attrs: &[syn::Attribute]) -> Option {
     attrs
         .iter()
         .flat_map(|a| {
-            if let Ok(syn::Meta::List(i)) = a.parse_meta() {
-                if i.path.is_ident("target_feature") {
-                    return i.nested;
+            if let syn::Meta::List(ref l) = a.meta {
+                if l.path.is_ident("target_feature") {
+                    if let Ok(l) =
+                        syn::punctuated::Punctuated::::parse_terminated
+                            .parse2(l.tokens.clone())
+                    {
+                        return l;
+                    }
                 }
             }
             syn::punctuated::Punctuated::new()
         })
-        .filter_map(|nested| match nested {
-            syn::NestedMeta::Meta(m) => Some(m),
-            syn::NestedMeta::Lit(_) => None,
-        })
         .find_map(|m| match m {
-            syn::Meta::NameValue(ref i) if i.path.is_ident("enable") => Some(i.clone().lit),
+            syn::Meta::NameValue(i) if i.path.is_ident("enable") => {
+                if let syn::Expr::Lit(lit) = i.value {
+                    Some(lit.lit)
+                } else {
+                    None
+                }
+            }
             _ => None,
         })
 }
@@ -491,9 +511,16 @@ fn find_required_const(name: &str, attrs: &[syn::Attribute]) -> Vec {
     attrs
         .iter()
-        .flat_map(|a| {
-            if a.path.segments[0].ident == name {
-                syn::parse::(a.tokens.clone().into())
+        .filter_map(|a| {
+            if let syn::Meta::List(ref l) = a.meta {
+                Some(l)
+            } else {
+                None
+            }
+        })
+        .flat_map(|l| {
+            if l.path.segments[0].ident == name {
+                syn::parse2::(l.tokens.clone())
                     .unwrap()
                     .args
             } else {
@@ -509,10 +536,7 @@ struct RustcArgsRequiredConst {

 impl syn::parse::Parse for RustcArgsRequiredConst {
     fn parse(input: syn::parse::ParseStream<'_>) -> syn::Result {
-        let content;
-        parenthesized!(content in input);
-        let list =
-            syn::punctuated::Punctuated::::parse_terminated(&content)?;
+        let list = syn::punctuated::Punctuated::::parse_terminated(&input)?;
         Ok(Self {
             args: list
                 .into_iter()
--
cgit v1.2.3
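
The simd-test-macro hunks earlier in the patch drop the hand-rolled `find_name` token walk and instead parse the annotated function with syn, taking the test name from `item.sig.ident` and lifting the original attributes off `item.attrs` so they can be re-emitted on the generated wrapper. A minimal standalone sketch of that pattern, outside a proc-macro context; the source text and printed output here are invented for illustration, and it assumes `syn = { version = "2.0", features = ["full"] }`:

```rust
// Sketch only: parse a function item the way the updated macro does, then read
// its identifier and attributes. Inside the real macro this would be
// syn::parse_macro_input!(item as syn::ItemFn) rather than syn::parse_str.
fn main() -> Result<(), syn::Error> {
    let src = r#"
        #[ignore]
        fn test_mm_add_ps() {}
    "#;
    let mut item: syn::ItemFn = syn::parse_str(src)?;
    // Take the attributes off the original function so they can be re-emitted
    // on the generated #[test] wrapper (the #(#item_attrs)* interpolation above).
    let item_attrs = std::mem::take(&mut item.attrs);
    println!("function name: {}", item.sig.ident);
    println!("lifted {} attribute(s)", item_attrs.len());
    Ok(())
}
```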
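The auxvec hunks remove the dev-dependency on the external `auxv` crate and make the tests compare `auxv()` against the crate's own `getauxval`-based and procfs-based readers. For orientation, this is roughly what a direct `getauxval` query looks like through the libc wrapper; the real std_detect code resolves the symbol with `dlsym` so it degrades gracefully, so treat this as a simplified sketch that assumes a linux-gnu target:

```rust
// Sketch only, assuming a linux-gnu target where libc links getauxval directly.
// std_detect itself looks the symbol up at run time instead of linking it.
#[cfg(all(target_os = "linux", target_env = "gnu"))]
fn hwcap_from_getauxval() -> Result<usize, ()> {
    // getauxval returns 0 for keys that are missing from the auxiliary vector.
    let v = unsafe { libc::getauxval(libc::AT_HWCAP) } as usize;
    if v != 0 {
        Ok(v)
    } else {
        Err(())
    }
}

#[cfg(all(target_os = "linux", target_env = "gnu"))]
fn main() {
    match hwcap_from_getauxval() {
        Ok(hwcap) => println!("AT_HWCAP = {hwcap:#x}"),
        Err(()) => println!("AT_HWCAP unavailable"),
    }
}

#[cfg(not(all(target_os = "linux", target_env = "gnu")))]
fn main() {}
```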
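The `detect/os/x86.rs` hunk replaces three `mem::transmute` calls on the cpuid registers with `u32::to_ne_bytes`, which expresses the same byte reinterpretation without extra `unsafe`. A small self-contained illustration of the technique; the hard-coded register values stand in for the `ebx`/`edx`/`ecx` that `__cpuid(0)` would return on an Intel CPU:

```rust
// Sketch only: assemble the 12-byte cpuid vendor string from three 32-bit
// registers with to_ne_bytes instead of transmute.
fn vendor_id(ebx: u32, edx: u32, ecx: u32) -> [u8; 12] {
    let mut out = [0u8; 12];
    out[0..4].copy_from_slice(&ebx.to_ne_bytes());
    out[4..8].copy_from_slice(&edx.to_ne_bytes());
    out[8..12].copy_from_slice(&ecx.to_ne_bytes());
    out
}

fn main() {
    // "Genu", "ineI", "ntel" as little-endian u32s, the layout cpuid uses on x86.
    let id = vendor_id(0x756e_6547, 0x4965_6e69, 0x6c65_746e);
    // to_ne_bytes is native-endian; on x86 (little-endian) this yields the ASCII string.
    assert_eq!(&id, b"GenuineIntel");
    println!("{}", core::str::from_utf8(&id).unwrap());
}
```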
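Most of the stdarch-verify rewrite is mechanical fallout from the syn 1.0 to 2.0 bump: `Attribute::parse_meta` and the raw `tokens` field are gone, the parsed form now lives in `attr.meta`, and nested meta lists are recovered by parsing the list's token stream with `Punctuated::parse_terminated`. A standalone sketch of that access pattern; the function and attribute below are invented for illustration and assume `syn = { version = "2.0", features = ["full"] }`:

```rust
use syn::parse::Parser as _;

// Sketch only: find a #[cfg_attr(...)] attribute and walk its comma-separated
// arguments, mirroring the shape of the updated find_instrs/find_target_feature.
fn main() -> Result<(), syn::Error> {
    let item: syn::ItemFn = syn::parse_str(
        r#"
        #[cfg_attr(test, assert_instr(vaddps))]
        fn dummy() {}
        "#,
    )?;

    for attr in &item.attrs {
        // In syn 2.0 the parsed attribute lives in attr.meta; a parenthesised
        // attribute such as cfg_attr shows up as Meta::List with raw tokens.
        if let syn::Meta::List(list) = &attr.meta {
            if list.path.is_ident("cfg_attr") {
                let nested = syn::punctuated::Punctuated::<syn::Meta, syn::Token![,]>::parse_terminated
                    .parse2(list.tokens.clone())?;
                for meta in nested {
                    if let Some(ident) = meta.path().get_ident() {
                        println!("nested meta: {ident}");
                    }
                }
            }
        }
    }
    Ok(())
}
```

The same `Meta::List` plus `parse2` combination is what lets the patch drop the old `NestedMeta` handling entirely.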