author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-06-07 05:48:48 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-06-07 05:48:48 +0000
commit     ef24de24a82fe681581cc130f342363c47c0969a (patch)
tree       0d494f7e1a38b95c92426f58fe6eaa877303a86c /library/stdarch/crates/core_arch/src/x86
parent     Releasing progress-linux version 1.74.1+dfsg1-1~progress7.99u1. (diff)
Merging upstream version 1.75.0+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'library/stdarch/crates/core_arch/src/x86')
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx.rs           |  40
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx2.rs          |  46
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs  |  12
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx512bw.rs      | 104
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx512f.rs       | 333
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/sse.rs           | 186
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/sse2.rs          | 309
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/sse3.rs          |  18
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/sse41.rs         |  72
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/test.rs          |   9
10 files changed, 621 insertions, 508 deletions
diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs
index 00bcc1fa1..de5dc05b8 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx.rs
@@ -268,7 +268,11 @@ pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
- addsubpd256(a, b)
+ let a = a.as_f64x4();
+ let b = b.as_f64x4();
+ let add = simd_add(a, b);
+ let sub = simd_sub(a, b);
+ simd_shuffle!(add, sub, [4, 1, 6, 3])
}
/// Alternatively adds and subtracts packed single-precision (32-bit)
@@ -280,7 +284,11 @@ pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
- addsubps256(a, b)
+ let a = a.as_f32x8();
+ let b = b.as_f32x8();
+ let add = simd_add(a, b);
+ let sub = simd_sub(a, b);
+ simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
}
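The rewritten addsub intrinsics compute a lane-wise sum and difference and then interleave them with simd_shuffle, so even lanes take a - b and odd lanes take a + b. A minimal scalar sketch of that per-lane behaviour (the helper name is illustrative, not part of the patch):

fn addsub_ps_reference(a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    let mut r = [0.0f32; 8];
    for i in 0..8 {
        // Even lanes come from the subtraction vector, odd lanes from the addition
        // vector, mirroring the [8, 1, 10, 3, 12, 5, 14, 7] shuffle above.
        r[i] = if i % 2 == 0 { a[i] - b[i] } else { a[i] + b[i] };
    }
    r
}
// e.g. addsub_ps_reference([1.0; 8], [0.5; 8]) == [0.5, 1.5, 0.5, 1.5, 0.5, 1.5, 0.5, 1.5]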
/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
@@ -511,7 +519,8 @@ pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
- vblendvpd(a, b, c)
+ let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::splat(0));
+ transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
}
/// Blends packed single-precision (32-bit) floating-point elements from
@@ -523,7 +532,8 @@ pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
- vblendvps(a, b, c)
+ let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::splat(0));
+ transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
}
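Both blendv replacements build a lane mask from the sign bit of the control vector (simd_lt against zero) and feed it to simd_select, so a lane of b is chosen exactly when the control lane's most significant bit is set. A scalar sketch of that selection rule, assuming plain arrays in place of the __m256 types:

fn blendv_ps_reference(a: [f32; 8], b: [f32; 8], c: [f32; 8]) -> [f32; 8] {
    let mut r = [0.0f32; 8];
    for i in 0..8 {
        // (c[i].to_bits() as i32) < 0  <=>  the MSB (sign bit) of the control lane is set.
        r[i] = if (c[i].to_bits() as i32) < 0 { b[i] } else { a[i] };
    }
    r
}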
/// Conditionally multiplies the packed single-precision (32-bit) floating-point
@@ -2056,7 +2066,10 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
#[cfg_attr(test, assert_instr(vmovmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
- movmskpd256(a)
+ // Propagate the highest bit to the rest, because simd_bitmask
+ // requires all-1 or all-0.
+ let mask: i64x4 = simd_lt(transmute(a), i64x4::splat(0));
+ simd_bitmask::<i64x4, u8>(mask).into()
}
/// Sets each bit of the returned mask based on the most significant bit of the
@@ -2069,7 +2082,10 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
#[cfg_attr(test, assert_instr(vmovmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
- movmskps256(a)
+ // Propagate the highest bit to the rest, because simd_bitmask
+ // requires all-1 or all-0.
+ let mask: i32x8 = simd_lt(transmute(a), i32x8::splat(0));
+ simd_bitmask::<i32x8, u8>(mask).into()
}
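The movemask rewrites first widen each sign bit to a full lane with simd_lt (the comment notes that simd_bitmask needs all-ones or all-zeros lanes) and then pack the lanes into a small integer. A scalar sketch of the resulting value for the ps variant (helper name is illustrative only):

fn movemask_ps_reference(a: [f32; 8]) -> i32 {
    let mut mask = 0i32;
    for i in 0..8 {
        // Bit i of the result is the sign bit of lane i.
        if a[i].to_bits() >> 31 == 1 {
            mask |= 1 << i;
        }
    }
    mask
}
// e.g. movemask_ps_reference([-1.0, 2.0, -3.0, 4.0, 5.0, 6.0, 7.0, -8.0]) == 0b1000_0101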
/// Returns vector of type __m256d with all elements set to zero.
@@ -2904,20 +2920,12 @@ pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 {
// LLVM intrinsics used in the above functions
#[allow(improper_ctypes)]
extern "C" {
- #[link_name = "llvm.x86.avx.addsub.pd.256"]
- fn addsubpd256(a: __m256d, b: __m256d) -> __m256d;
- #[link_name = "llvm.x86.avx.addsub.ps.256"]
- fn addsubps256(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.avx.round.pd.256"]
fn roundpd256(a: __m256d, b: i32) -> __m256d;
#[link_name = "llvm.x86.avx.round.ps.256"]
fn roundps256(a: __m256, b: i32) -> __m256;
#[link_name = "llvm.x86.avx.sqrt.ps.256"]
fn sqrtps256(a: __m256) -> __m256;
- #[link_name = "llvm.x86.avx.blendv.pd.256"]
- fn vblendvpd(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
- #[link_name = "llvm.x86.avx.blendv.ps.256"]
- fn vblendvps(a: __m256, b: __m256, c: __m256) -> __m256;
#[link_name = "llvm.x86.avx.dp.ps.256"]
fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256;
#[link_name = "llvm.x86.avx.hadd.pd.256"]
@@ -3026,10 +3034,6 @@ extern "C" {
fn vtestcps(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.ps"]
fn vtestnzcps(a: __m128, b: __m128) -> i32;
- #[link_name = "llvm.x86.avx.movmsk.pd.256"]
- fn movmskpd256(a: __m256d) -> i32;
- #[link_name = "llvm.x86.avx.movmsk.ps.256"]
- fn movmskps256(a: __m256) -> i32;
#[link_name = "llvm.x86.avx.min.ps.256"]
fn vminps(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.avx.max.ps.256"]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs
index e23c795ee..243a4cdab 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@@ -344,7 +344,10 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
- transmute(pavgw(a.as_u16x16(), b.as_u16x16()))
+ let a = simd_cast::<_, u32x16>(a.as_u16x16());
+ let b = simd_cast::<_, u32x16>(b.as_u16x16());
+ let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
+ transmute(simd_cast::<_, u16x16>(r))
}
/// Averages packed unsigned 8-bit integers in `a` and `b`.
@@ -355,7 +358,10 @@ pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
- transmute(pavgb(a.as_u8x32(), b.as_u8x32()))
+ let a = simd_cast::<_, u16x32>(a.as_u8x32());
+ let b = simd_cast::<_, u16x32>(b.as_u8x32());
+ let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
+ transmute(simd_cast::<_, u8x32>(r))
}
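The pavg replacements compute the rounding average (a + b + 1) >> 1 in a lane type twice as wide, so the intermediate sum cannot wrap before the shift. Per element this amounts to (illustrative name, not the patch's code):

fn avg_epu8_reference(a: u8, b: u8) -> u8 {
    // Widen to u16, add the rounding bias, halve, narrow back.
    ((a as u16 + b as u16 + 1) >> 1) as u8
}
// e.g. avg_epu8_reference(255, 254) == 255 -- the u16 widening prevents overflow.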
/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
@@ -458,7 +464,8 @@ pub unsafe fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
#[cfg_attr(test, assert_instr(vpblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
- transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
+ let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::splat(0));
+ transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
}
/// Broadcasts the low packed 8-bit integer from `a` to all elements of
@@ -2060,7 +2067,9 @@ pub unsafe fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __
#[cfg_attr(test, assert_instr(vpmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
- transmute(pmuldq(a.as_i32x8(), b.as_i32x8()))
+ let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
+ let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
+ transmute(simd_mul(a, b))
}
/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
@@ -2074,7 +2083,10 @@ pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
- transmute(pmuludq(a.as_u32x8(), b.as_u32x8()))
+ let a = a.as_u64x4();
+ let b = b.as_u64x4();
+ let mask = u64x4::splat(u32::MAX.into());
+ transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
}
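_mm256_mul_epi32 and _mm256_mul_epu32 multiply only the low 32 bits of every 64-bit lane and keep the full 64-bit product: the signed form sign-extends via the i64x4 -> i32x4 -> i64x4 cast round-trip, the unsigned form masks the upper halves before multiplying. A per-lane scalar sketch (names are illustrative only):

fn mul_epu32_lane(a: u64, b: u64) -> u64 {
    // Low 32 bits of each operand, full 64-bit unsigned product.
    (a & 0xFFFF_FFFF) * (b & 0xFFFF_FFFF)
}

fn mul_epi32_lane(a: i64, b: i64) -> i64 {
    // Truncate to the low 32 bits, reinterpret as signed, sign-extend, multiply.
    (a as i32 as i64) * (b as i32 as i64)
}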
/// Multiplies the packed 16-bit integers in `a` and `b`, producing
@@ -2087,7 +2099,10 @@ pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
- transmute(pmulhw(a.as_i16x16(), b.as_i16x16()))
+ let a = simd_cast::<_, i32x16>(a.as_i16x16());
+ let b = simd_cast::<_, i32x16>(b.as_i16x16());
+ let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
+ transmute(simd_cast::<i32x16, i16x16>(r))
}
/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
@@ -2100,7 +2115,10 @@ pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
- transmute(pmulhuw(a.as_u16x16(), b.as_u16x16()))
+ let a = simd_cast::<_, u32x16>(a.as_u16x16());
+ let b = simd_cast::<_, u32x16>(b.as_u16x16());
+ let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
+ transmute(simd_cast::<u32x16, u16x16>(r))
}
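Both mulhi rewrites widen the 16-bit lanes to 32 bits, multiply, shift right by 16 and narrow again, which is exactly the high half of the full product. Per lane (illustrative names):

fn mulhi_epu16_lane(a: u16, b: u16) -> u16 {
    ((a as u32 * b as u32) >> 16) as u16
}

fn mulhi_epi16_lane(a: i16, b: i16) -> i16 {
    // Arithmetic shift on the signed 32-bit product keeps the sign of the high half.
    ((a as i32 * b as i32) >> 16) as i16
}
// e.g. mulhi_epu16_lane(0x8000, 0x0004) == 0x0002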
/// Multiplies the packed 16-bit integers in `a` and `b`, producing
@@ -3629,12 +3647,6 @@ extern "C" {
fn pabsw(a: i16x16) -> u16x16;
#[link_name = "llvm.x86.avx2.pabs.d"]
fn pabsd(a: i32x8) -> u32x8;
- #[link_name = "llvm.x86.avx2.pavg.b"]
- fn pavgb(a: u8x32, b: u8x32) -> u8x32;
- #[link_name = "llvm.x86.avx2.pavg.w"]
- fn pavgw(a: u16x16, b: u16x16) -> u16x16;
- #[link_name = "llvm.x86.avx2.pblendvb"]
- fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32;
#[link_name = "llvm.x86.avx2.phadd.w"]
fn phaddw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.phadd.d"]
@@ -3669,14 +3681,6 @@ extern "C" {
fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
#[link_name = "llvm.x86.avx2.mpsadbw"]
fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
- #[link_name = "llvm.x86.avx2.pmulhu.w"]
- fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
- #[link_name = "llvm.x86.avx2.pmulh.w"]
- fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
- #[link_name = "llvm.x86.avx2.pmul.dq"]
- fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
- #[link_name = "llvm.x86.avx2.pmulu.dq"]
- fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
#[link_name = "llvm.x86.avx2.pmul.hr.sw"]
fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.packsswb"]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
index 92e572eb1..ce4e402a8 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
@@ -311,7 +311,7 @@ pub unsafe fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __
#[target_feature(enable = "avx512bitalg")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 {
- transmute(bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0))
+ bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -326,7 +326,7 @@ pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64
#[target_feature(enable = "avx512bitalg")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m512i) -> __mmask64 {
- transmute(bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k))
+ bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -338,7 +338,7 @@ pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m
#[target_feature(enable = "avx512bitalg,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 {
- transmute(bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0))
+ bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -353,7 +353,7 @@ pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32
#[target_feature(enable = "avx512bitalg,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m256i) -> __mmask32 {
- transmute(bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k))
+ bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -365,7 +365,7 @@ pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m
#[target_feature(enable = "avx512bitalg,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 {
- transmute(bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0))
+ bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -380,7 +380,7 @@ pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 {
#[target_feature(enable = "avx512bitalg,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm_mask_bitshuffle_epi64_mask(k: __mmask16, b: __m128i, c: __m128i) -> __mmask16 {
- transmute(bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k))
+ bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k)
}
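The bitshuffle intrinsics now return the mask produced by the LLVM helper directly, since it already has the __mmask return type. As a reminder of what vpshufbitqmb computes, a scalar sketch of the 128-bit variant under the usual reading of the instruction (illustrative only, not the patch's code): bit i of the mask is the bit of the corresponding 64-bit lane of b selected by the low six bits of byte i of c.

fn bitshuffle_epi64_mask_reference(b: [u64; 2], c: [u8; 16]) -> u16 {
    let mut k = 0u16;
    for i in 0..16 {
        let qword = b[i / 8]; // bytes 0..8 index b[0], bytes 8..16 index b[1]
        let bit = (qword >> (c[i] & 63)) & 1;
        k |= (bit as u16) << i;
    }
    k
}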
#[cfg(test)]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
index 364023539..0b4a56d36 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
@@ -3703,8 +3703,7 @@ pub unsafe fn _mm512_cmp_epu16_mask<const IMM8: i32>(a: __m512i, b: __m512i) ->
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x32();
let b = b.as_u16x32();
- let r = vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
- transmute(r)
+ vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3722,8 +3721,7 @@ pub unsafe fn _mm512_mask_cmp_epu16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x32();
let b = b.as_u16x32();
- let r = vpcmpuw(a, b, IMM8, k1);
- transmute(r)
+ vpcmpuw(a, b, IMM8, k1)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3737,8 +3735,7 @@ pub unsafe fn _mm256_cmp_epu16_mask<const IMM8: i32>(a: __m256i, b: __m256i) ->
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x16();
let b = b.as_u16x16();
- let r = vpcmpuw256(a, b, IMM8, 0b11111111_11111111);
- transmute(r)
+ vpcmpuw256(a, b, IMM8, 0b11111111_11111111)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3756,8 +3753,7 @@ pub unsafe fn _mm256_mask_cmp_epu16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x16();
let b = b.as_u16x16();
- let r = vpcmpuw256(a, b, IMM8, k1);
- transmute(r)
+ vpcmpuw256(a, b, IMM8, k1)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3771,8 +3767,7 @@ pub unsafe fn _mm_cmp_epu16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __m
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x8();
let b = b.as_u16x8();
- let r = vpcmpuw128(a, b, IMM8, 0b11111111);
- transmute(r)
+ vpcmpuw128(a, b, IMM8, 0b11111111)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3790,8 +3785,7 @@ pub unsafe fn _mm_mask_cmp_epu16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x8();
let b = b.as_u16x8();
- let r = vpcmpuw128(a, b, IMM8, k1);
- transmute(r)
+ vpcmpuw128(a, b, IMM8, k1)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3805,13 +3799,12 @@ pub unsafe fn _mm512_cmp_epu8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> _
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x64();
let b = b.as_u8x64();
- let r = vpcmpub(
+ vpcmpub(
a,
b,
IMM8,
0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
- );
- transmute(r)
+ )
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3829,8 +3822,7 @@ pub unsafe fn _mm512_mask_cmp_epu8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x64();
let b = b.as_u8x64();
- let r = vpcmpub(a, b, IMM8, k1);
- transmute(r)
+ vpcmpub(a, b, IMM8, k1)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3844,8 +3836,7 @@ pub unsafe fn _mm256_cmp_epu8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> _
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x32();
let b = b.as_u8x32();
- let r = vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
- transmute(r)
+ vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3863,8 +3854,7 @@ pub unsafe fn _mm256_mask_cmp_epu8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x32();
let b = b.as_u8x32();
- let r = vpcmpub256(a, b, IMM8, k1);
- transmute(r)
+ vpcmpub256(a, b, IMM8, k1)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3878,8 +3868,7 @@ pub unsafe fn _mm_cmp_epu8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mm
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x16();
let b = b.as_u8x16();
- let r = vpcmpub128(a, b, IMM8, 0b11111111_11111111);
- transmute(r)
+ vpcmpub128(a, b, IMM8, 0b11111111_11111111)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3897,8 +3886,7 @@ pub unsafe fn _mm_mask_cmp_epu8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x16();
let b = b.as_u8x16();
- let r = vpcmpub128(a, b, IMM8, k1);
- transmute(r)
+ vpcmpub128(a, b, IMM8, k1)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3912,8 +3900,7 @@ pub unsafe fn _mm512_cmp_epi16_mask<const IMM8: i32>(a: __m512i, b: __m512i) ->
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x32();
let b = b.as_i16x32();
- let r = vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
- transmute(r)
+ vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3931,8 +3918,7 @@ pub unsafe fn _mm512_mask_cmp_epi16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x32();
let b = b.as_i16x32();
- let r = vpcmpw(a, b, IMM8, k1);
- transmute(r)
+ vpcmpw(a, b, IMM8, k1)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3946,8 +3932,7 @@ pub unsafe fn _mm256_cmp_epi16_mask<const IMM8: i32>(a: __m256i, b: __m256i) ->
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x16();
let b = b.as_i16x16();
- let r = vpcmpw256(a, b, IMM8, 0b11111111_11111111);
- transmute(r)
+ vpcmpw256(a, b, IMM8, 0b11111111_11111111)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3965,8 +3950,7 @@ pub unsafe fn _mm256_mask_cmp_epi16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x16();
let b = b.as_i16x16();
- let r = vpcmpw256(a, b, IMM8, k1);
- transmute(r)
+ vpcmpw256(a, b, IMM8, k1)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3980,8 +3964,7 @@ pub unsafe fn _mm_cmp_epi16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __m
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x8();
let b = b.as_i16x8();
- let r = vpcmpw128(a, b, IMM8, 0b11111111);
- transmute(r)
+ vpcmpw128(a, b, IMM8, 0b11111111)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3999,8 +3982,7 @@ pub unsafe fn _mm_mask_cmp_epi16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x8();
let b = b.as_i16x8();
- let r = vpcmpw128(a, b, IMM8, k1);
- transmute(r)
+ vpcmpw128(a, b, IMM8, k1)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4014,13 +3996,12 @@ pub unsafe fn _mm512_cmp_epi8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> _
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x64();
let b = b.as_i8x64();
- let r = vpcmpb(
+ vpcmpb(
a,
b,
IMM8,
0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
- );
- transmute(r)
+ )
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4038,8 +4019,7 @@ pub unsafe fn _mm512_mask_cmp_epi8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x64();
let b = b.as_i8x64();
- let r = vpcmpb(a, b, IMM8, k1);
- transmute(r)
+ vpcmpb(a, b, IMM8, k1)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4053,8 +4033,7 @@ pub unsafe fn _mm256_cmp_epi8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> _
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x32();
let b = b.as_i8x32();
- let r = vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
- transmute(r)
+ vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4072,8 +4051,7 @@ pub unsafe fn _mm256_mask_cmp_epi8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x32();
let b = b.as_i8x32();
- let r = vpcmpb256(a, b, IMM8, k1);
- transmute(r)
+ vpcmpb256(a, b, IMM8, k1)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4087,8 +4065,7 @@ pub unsafe fn _mm_cmp_epi8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mm
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x16();
let b = b.as_i8x16();
- let r = vpcmpb128(a, b, IMM8, 0b11111111_11111111);
- transmute(r)
+ vpcmpb128(a, b, IMM8, 0b11111111_11111111)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4106,8 +4083,7 @@ pub unsafe fn _mm_mask_cmp_epi8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x16();
let b = b.as_i8x16();
- let r = vpcmpb128(a, b, IMM8, k1);
- transmute(r)
+ vpcmpb128(a, b, IMM8, k1)
}
/// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
@@ -8566,7 +8542,7 @@ pub unsafe fn _mm_movm_epi8(k: __mmask16) -> __m128i {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(a + b)
+ a + b
}
/// Add 64-bit masks in a and b, and store the result in k.
@@ -8575,7 +8551,7 @@ pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(a + b)
+ a + b
}
/// Compute the bitwise AND of 32-bit masks a and b, and store the result in k.
@@ -8584,7 +8560,7 @@ pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(a & b)
+ a & b
}
/// Compute the bitwise AND of 64-bit masks a and b, and store the result in k.
@@ -8593,7 +8569,7 @@ pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(a & b)
+ a & b
}
/// Compute the bitwise NOT of 32-bit mask a, and store the result in k.
@@ -8602,7 +8578,7 @@ pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 {
- transmute(a ^ 0b11111111_11111111_11111111_11111111)
+ a ^ 0b11111111_11111111_11111111_11111111
}
/// Compute the bitwise NOT of 64-bit mask a, and store the result in k.
@@ -8611,7 +8587,7 @@ pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 {
- transmute(a ^ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111)
+ a ^ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
}
/// Compute the bitwise NOT of 32-bit masks a and then AND with b, and store the result in k.
@@ -8620,7 +8596,7 @@ pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(_knot_mask32(a) & b)
+ _knot_mask32(a) & b
}
/// Compute the bitwise NOT of 64-bit masks a and then AND with b, and store the result in k.
@@ -8629,7 +8605,7 @@ pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(_knot_mask64(a) & b)
+ _knot_mask64(a) & b
}
/// Compute the bitwise OR of 32-bit masks a and b, and store the result in k.
@@ -8638,7 +8614,7 @@ pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(a | b)
+ a | b
}
/// Compute the bitwise OR of 64-bit masks a and b, and store the result in k.
@@ -8647,7 +8623,7 @@ pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(a | b)
+ a | b
}
/// Compute the bitwise XOR of 32-bit masks a and b, and store the result in k.
@@ -8656,7 +8632,7 @@ pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(a ^ b)
+ a ^ b
}
/// Compute the bitwise XOR of 64-bit masks a and b, and store the result in k.
@@ -8665,7 +8641,7 @@ pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(a ^ b)
+ a ^ b
}
/// Compute the bitwise XNOR of 32-bit masks a and b, and store the result in k.
@@ -8674,7 +8650,7 @@ pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(_knot_mask32(a ^ b))
+ _knot_mask32(a ^ b)
}
/// Compute the bitwise XNOR of 64-bit masks a and b, and store the result in k.
@@ -8683,7 +8659,7 @@ pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(_knot_mask64(a ^ b))
+ _knot_mask64(a ^ b)
}
/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
index 5412237ca..280135292 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
@@ -17144,7 +17144,7 @@ pub unsafe fn _mm512_slli_epi32<const IMM8: u32>(a: __m512i) -> __m512i {
if IMM8 >= 32 {
_mm512_setzero_si512()
} else {
- transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8 as u32)))
+ transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8)))
}
}
@@ -20132,7 +20132,7 @@ pub unsafe fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m5
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] //should be vpermd
pub unsafe fn _mm256_permutexvar_epi32(idx: __m256i, a: __m256i) -> __m256i {
- transmute(_mm256_permutevar8x32_epi32(a, idx)) // llvm use llvm.x86.avx2.permd
+ _mm256_permutevar8x32_epi32(a, idx) // llvm use llvm.x86.avx2.permd
}
/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -20284,7 +20284,7 @@ pub unsafe fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512)
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermps))]
pub unsafe fn _mm256_permutexvar_ps(idx: __m256i, a: __m256) -> __m256 {
- transmute(_mm256_permutevar8x32_ps(a, idx)) //llvm.x86.avx2.permps
+ _mm256_permutevar8x32_ps(a, idx) //llvm.x86.avx2.permps
}
/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -23943,7 +23943,7 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d {
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(vmovd))]
pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 {
let extract: i32 = simd_extract(a.as_i32x16(), 0);
- transmute(extract)
+ extract
}
/// Broadcast the low packed 32-bit integer from a to all elements of dst.
@@ -25744,7 +25744,7 @@ pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a & b)
+ a & b
}
/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k.
@@ -25754,7 +25754,7 @@ pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a & b)
+ a & b
}
/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
@@ -25764,7 +25764,7 @@ pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a | b)
+ a | b
}
/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
@@ -25774,7 +25774,7 @@ pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a | b)
+ a | b
}
/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
@@ -25784,7 +25784,7 @@ pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a ^ b)
+ a ^ b
}
/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
@@ -25794,7 +25794,7 @@ pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a ^ b)
+ a ^ b
}
/// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
@@ -25803,7 +25803,7 @@ pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 {
- transmute(a ^ 0b11111111_11111111)
+ a ^ 0b11111111_11111111
}
/// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
@@ -25812,7 +25812,7 @@ pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_knot(a: __mmask16) -> __mmask16 {
- transmute(a ^ 0b11111111_11111111)
+ a ^ 0b11111111_11111111
}
/// Compute the bitwise NOT of 16-bit masks a and then AND with b, and store the result in k.
@@ -25862,8 +25862,7 @@ pub unsafe fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw
pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 {
- let r: u16 = a;
- transmute(r)
+ a
}
/// Converts integer mask into bitmask, storing the result in dst.
@@ -25872,8 +25871,7 @@ pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 {
#[inline]
#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw
pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 {
- let r: u16 = mask as u16;
- transmute(r)
+ mask as u16
}
/// Converts bit mask k1 into an integer value, storing the results in dst.
@@ -25883,8 +25881,7 @@ pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw
pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 {
- let r: i32 = k1 as i32;
- transmute(r)
+ k1 as i32
}
/// Unpack and interleave 8 bits from masks a and b, and store the 16-bit result in k.
@@ -25896,7 +25893,7 @@ pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 {
pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 {
let a = a & 0b00000000_11111111;
let b = b & 0b11111111_00000000;
- transmute(a | b)
+ a | b
}
/// Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's.
@@ -32352,8 +32349,7 @@ pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -
if (k & 0b00000001) != 0 {
mov = simd_extract(b, 0);
}
- let r = simd_insert(a, 0, mov);
- transmute(r)
+ simd_insert(a, 0, mov)
}
/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32367,8 +32363,7 @@ pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
if (k & 0b00000001) != 0 {
mov = simd_extract(b, 0);
}
- let r = simd_insert(a, 0, mov);
- transmute(r)
+ simd_insert(a, 0, mov)
}
/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32383,8 +32378,7 @@ pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d
if (k & 0b00000001) != 0 {
mov = simd_extract(b, 0);
}
- let r = simd_insert(a, 0, mov);
- transmute(r)
+ simd_insert(a, 0, mov)
}
/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32398,8 +32392,7 @@ pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d
if (k & 0b00000001) != 0 {
mov = simd_extract(b, 0);
}
- let r = simd_insert(a, 0, mov);
- transmute(r)
+ simd_insert(a, 0, mov)
}
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32416,8 +32409,7 @@ pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
let extractb: f32 = simd_extract(b, 0);
add = extracta + extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32433,8 +32425,7 @@ pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
let extractb: f32 = simd_extract(b, 0);
add = extracta + extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32451,8 +32442,7 @@ pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
let extractb: f64 = simd_extract(b, 0);
add = extracta + extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32468,8 +32458,7 @@ pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
let extractb: f64 = simd_extract(b, 0);
add = extracta + extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
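Each of these scalar mask intrinsics now returns the simd_insert expression directly. The pattern is the same throughout: perform the scalar operation on lane 0 only when bit 0 of the mask is set, otherwise fall back to src (writemask form) or 0.0 (zeromask form), and copy the remaining lanes from a. A sketch with plain arrays standing in for __m128 (names are illustrative only):

fn mask_add_ss_reference(src: [f32; 4], k: u8, a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    // Writemask: lane 0 falls back to src when mask bit 0 is clear.
    let lane0 = if k & 1 != 0 { a[0] + b[0] } else { src[0] };
    [lane0, a[1], a[2], a[3]]
}

fn maskz_add_ss_reference(k: u8, a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    // Zeromask: lane 0 falls back to 0.0 when mask bit 0 is clear.
    let lane0 = if k & 1 != 0 { a[0] + b[0] } else { 0.0 };
    [lane0, a[1], a[2], a[3]]
}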
/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32486,8 +32475,7 @@ pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
let extractb: f32 = simd_extract(b, 0);
add = extracta - extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32503,8 +32491,7 @@ pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
let extractb: f32 = simd_extract(b, 0);
add = extracta - extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32521,8 +32508,7 @@ pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
let extractb: f64 = simd_extract(b, 0);
add = extracta - extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32538,8 +32524,7 @@ pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
let extractb: f64 = simd_extract(b, 0);
add = extracta - extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32556,8 +32541,7 @@ pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
let extractb: f32 = simd_extract(b, 0);
add = extracta * extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32573,8 +32557,7 @@ pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
let extractb: f32 = simd_extract(b, 0);
add = extracta * extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32591,8 +32574,7 @@ pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
let extractb: f64 = simd_extract(b, 0);
add = extracta * extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32608,8 +32590,7 @@ pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
let extractb: f64 = simd_extract(b, 0);
add = extracta * extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32626,8 +32607,7 @@ pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
let extractb: f32 = simd_extract(b, 0);
add = extracta / extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32643,8 +32623,7 @@ pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
let extractb: f32 = simd_extract(b, 0);
add = extracta / extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32661,8 +32640,7 @@ pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
let extractb: f64 = simd_extract(b, 0);
add = extracta / extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32678,8 +32656,7 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
let extractb: f64 = simd_extract(b, 0);
add = extracta / extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33587,8 +33564,7 @@ pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) ->
let extractc: f32 = simd_extract(c, 0);
fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33605,8 +33581,7 @@ pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -
let extractc: f32 = simd_extract(c, 0);
fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
@@ -33622,8 +33597,7 @@ pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -
let extractb: f32 = simd_extract(b, 0);
fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fmadd);
- transmute(r)
+ simd_insert(c, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33639,8 +33613,7 @@ pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d)
let extractc: f64 = simd_extract(c, 0);
fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33657,8 +33630,7 @@ pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d
let extractc: f64 = simd_extract(c, 0);
fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
@@ -33674,8 +33646,7 @@ pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8
let extractb: f64 = simd_extract(b, 0);
fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fmadd);
- transmute(r)
+ simd_insert(c, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33692,8 +33663,7 @@ pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) ->
let extractc = -extractc;
fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33711,8 +33681,7 @@ pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -
let extractc = -extractc;
fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
@@ -33729,8 +33698,7 @@ pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -
let extractc = -fmsub;
fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fmsub);
- transmute(r)
+ simd_insert(c, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33747,8 +33715,7 @@ pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d)
let extractc = -extractc;
fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33766,8 +33733,7 @@ pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d
let extractc = -extractc;
fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
@@ -33784,8 +33750,7 @@ pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8
let extractc = -fmsub;
fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fmsub);
- transmute(r)
+ simd_insert(c, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33802,8 +33767,7 @@ pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -
let extractc: f32 = simd_extract(c, 0);
fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33821,8 +33785,7 @@ pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128)
let extractc: f32 = simd_extract(c, 0);
fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
@@ -33839,8 +33802,7 @@ pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8)
let extractb: f32 = simd_extract(b, 0);
fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fnmadd);
- transmute(r)
+ simd_insert(c, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33857,8 +33819,7 @@ pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d
let extractc: f64 = simd_extract(c, 0);
fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33876,8 +33837,7 @@ pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128
let extractc: f64 = simd_extract(c, 0);
fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
@@ -33894,8 +33854,7 @@ pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask
let extractb: f64 = simd_extract(b, 0);
fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fnmadd);
- transmute(r)
+ simd_insert(c, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33913,8 +33872,7 @@ pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -
let extractc = -extractc;
fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33933,8 +33891,7 @@ pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128)
let extractc = -extractc;
fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
@@ -33952,8 +33909,7 @@ pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8)
let extractc = -fnmsub;
fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fnmsub);
- transmute(r)
+ simd_insert(c, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33971,8 +33927,7 @@ pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d
let extractc = -extractc;
fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33991,8 +33946,7 @@ pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128
let extractc = -extractc;
fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
@@ -34010,8 +33964,7 @@ pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask
let extractc = -fnmsub;
fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fnmsub);
- transmute(r)
+ simd_insert(c, 0, fnmsub)
}
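A note added for this write-up: the four scalar FMA flavours that this stretch of the diff keeps touching differ only in which operands are negated, which is why the implementations above flip the sign of extracta and/or extractc before calling the vfmadd132ss/vfmadd132sd helpers. A plain-Rust model (illustrative names, not code from the crate), using f64::mul_add for the fused operation:

    fn fmadd(a: f64, b: f64, c: f64) -> f64 { a.mul_add(b, c) }      //  (a * b) + c
    fn fmsub(a: f64, b: f64, c: f64) -> f64 { a.mul_add(b, -c) }     //  (a * b) - c
    fn fnmadd(a: f64, b: f64, c: f64) -> f64 { (-a).mul_add(b, c) }  // -(a * b) + c
    fn fnmsub(a: f64, b: f64, c: f64) -> f64 { (-a).mul_add(b, -c) } // -(a * b) - c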
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35705,8 +35658,7 @@ pub unsafe fn _mm_fmadd_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: _
let extractb: f32 = simd_extract(b, 0);
let extractc: f32 = simd_extract(c, 0);
let r = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, r);
- transmute(r)
+ simd_insert(a, 0, r)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35736,8 +35688,7 @@ pub unsafe fn _mm_mask_fmadd_round_ss<const ROUNDING: i32>(
let extractc: f32 = simd_extract(c, 0);
fmadd = vfmadd132ss(fmadd, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35768,8 +35719,7 @@ pub unsafe fn _mm_maskz_fmadd_round_ss<const ROUNDING: i32>(
let extractc: f32 = simd_extract(c, 0);
fmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
@@ -35799,8 +35749,7 @@ pub unsafe fn _mm_mask3_fmadd_round_ss<const ROUNDING: i32>(
let extractb: f32 = simd_extract(b, 0);
fmadd = vfmadd132ss(extracta, extractb, fmadd, ROUNDING);
}
- let r = simd_insert(c, 0, fmadd);
- transmute(r)
+ simd_insert(c, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
@@ -35827,8 +35776,7 @@ pub unsafe fn _mm_fmadd_round_sd<const ROUNDING: i32>(
let extractb: f64 = simd_extract(b, 0);
let extractc: f64 = simd_extract(c, 0);
let fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -35858,8 +35806,7 @@ pub unsafe fn _mm_mask_fmadd_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
fmadd = vfmadd132sd(fmadd, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -35890,8 +35837,7 @@ pub unsafe fn _mm_maskz_fmadd_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
@@ -35921,8 +35867,7 @@ pub unsafe fn _mm_mask3_fmadd_round_sd<const ROUNDING: i32>(
let extractb: f64 = simd_extract(b, 0);
fmadd = vfmadd132sd(extracta, extractb, fmadd, ROUNDING);
}
- let r = simd_insert(c, 0, fmadd);
- transmute(r)
+ simd_insert(c, 0, fmadd)
}
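For orientation (an addition here, not part of the diff): the mask, maskz and mask3 variants documented above differ only in what lands in lane 0 when mask bit 0 is clear (a copy of a, zero, or a copy of c respectively) and in which operand supplies the upper lane. A plain-Rust sketch of those selection rules for the double-precision case, with illustrative names:

    fn mask_fmadd_sd(a: [f64; 2], k: u8, b: [f64; 2], c: [f64; 2]) -> [f64; 2] {
        let lane0 = if k & 1 != 0 { a[0].mul_add(b[0], c[0]) } else { a[0] };
        [lane0, a[1]] // upper lane always comes from `a`
    }

    fn maskz_fmadd_sd(k: u8, a: [f64; 2], b: [f64; 2], c: [f64; 2]) -> [f64; 2] {
        let lane0 = if k & 1 != 0 { a[0].mul_add(b[0], c[0]) } else { 0.0 };
        [lane0, a[1]]
    }

    fn mask3_fmadd_sd(a: [f64; 2], b: [f64; 2], c: [f64; 2], k: u8) -> [f64; 2] {
        let lane0 = if k & 1 != 0 { a[0].mul_add(b[0], c[0]) } else { c[0] };
        [lane0, c[1]] // upper lane always comes from `c`
    }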
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35946,8 +35891,7 @@ pub unsafe fn _mm_fmsub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: _
let extractc: f32 = simd_extract(c, 0);
let extractc = -extractc;
let fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35978,8 +35922,7 @@ pub unsafe fn _mm_mask_fmsub_round_ss<const ROUNDING: i32>(
let extractc = -extractc;
fmsub = vfmadd132ss(fmsub, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36011,8 +35954,7 @@ pub unsafe fn _mm_maskz_fmsub_round_ss<const ROUNDING: i32>(
let extractc = -extractc;
fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
@@ -36043,8 +35985,7 @@ pub unsafe fn _mm_mask3_fmsub_round_ss<const ROUNDING: i32>(
let extractc = -fmsub;
fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(c, 0, fmsub);
- transmute(r)
+ simd_insert(c, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
@@ -36072,8 +36013,7 @@ pub unsafe fn _mm_fmsub_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
let extractc = -extractc;
let fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36104,8 +36044,7 @@ pub unsafe fn _mm_mask_fmsub_round_sd<const ROUNDING: i32>(
let extractc = -extractc;
fmsub = vfmadd132sd(fmsub, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36137,8 +36076,7 @@ pub unsafe fn _mm_maskz_fmsub_round_sd<const ROUNDING: i32>(
let extractc = -extractc;
fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
@@ -36169,8 +36107,7 @@ pub unsafe fn _mm_mask3_fmsub_round_sd<const ROUNDING: i32>(
let extractc = -fmsub;
fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(c, 0, fmsub);
- transmute(r)
+ simd_insert(c, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36194,8 +36131,7 @@ pub unsafe fn _mm_fnmadd_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c:
let extractb: f32 = simd_extract(b, 0);
let extractc: f32 = simd_extract(c, 0);
let fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36226,8 +36162,7 @@ pub unsafe fn _mm_mask_fnmadd_round_ss<const ROUNDING: i32>(
let extractc: f32 = simd_extract(c, 0);
fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36259,8 +36194,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_ss<const ROUNDING: i32>(
let extractc: f32 = simd_extract(c, 0);
fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
@@ -36291,8 +36225,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_ss<const ROUNDING: i32>(
let extractb: f32 = simd_extract(b, 0);
fnmadd = vfmadd132ss(extracta, extractb, fnmadd, ROUNDING);
}
- let r = simd_insert(c, 0, fnmadd);
- transmute(r)
+ simd_insert(c, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
@@ -36320,8 +36253,7 @@ pub unsafe fn _mm_fnmadd_round_sd<const ROUNDING: i32>(
let extractb: f64 = simd_extract(b, 0);
let extractc: f64 = simd_extract(c, 0);
let fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36352,8 +36284,7 @@ pub unsafe fn _mm_mask_fnmadd_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36385,8 +36316,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
@@ -36417,8 +36347,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd<const ROUNDING: i32>(
let extractb: f64 = simd_extract(b, 0);
fnmadd = vfmadd132sd(extracta, extractb, fnmadd, ROUNDING);
}
- let r = simd_insert(c, 0, fnmadd);
- transmute(r)
+ simd_insert(c, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36443,8 +36372,7 @@ pub unsafe fn _mm_fnmsub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c:
let extractc: f32 = simd_extract(c, 0);
let extractc = -extractc;
let fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36476,8 +36404,7 @@ pub unsafe fn _mm_mask_fnmsub_round_ss<const ROUNDING: i32>(
let extractc = -extractc;
fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36510,8 +36437,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss<const ROUNDING: i32>(
let extractc = -extractc;
fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
@@ -36543,8 +36469,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_ss<const ROUNDING: i32>(
let extractc = -fnmsub;
fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(c, 0, fnmsub);
- transmute(r)
+ simd_insert(c, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
@@ -36573,8 +36498,7 @@ pub unsafe fn _mm_fnmsub_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
let extractc = -extractc;
let fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36606,8 +36530,7 @@ pub unsafe fn _mm_mask_fnmsub_round_sd<const ROUNDING: i32>(
let extractc = -extractc;
fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36640,8 +36563,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd<const ROUNDING: i32>(
let extractc = -extractc;
fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
@@ -36673,8 +36595,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_sd<const ROUNDING: i32>(
let extractc = -fnmsub;
fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(c, 0, fnmsub);
- transmute(r)
+ simd_insert(c, 0, fnmsub)
}
/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
@@ -37168,8 +37089,7 @@ pub unsafe fn _mm_maskz_cvt_roundsd_ss<const ROUNDING: i32>(
pub unsafe fn _mm_cvt_roundss_si32<const ROUNDING: i32>(a: __m128) -> i32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f32x4();
- let r = vcvtss2si(a, ROUNDING);
- transmute(r)
+ vcvtss2si(a, ROUNDING)
}
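A hedged usage sketch for the explicit-rounding conversions in this block (added for illustration; it assumes a nightly toolchain with the unstable AVX-512 intrinsics enabled and an avx512f-capable CPU, and the function name is made up). The ROUNDING constant must be one of the combinations static_assert_rounding! accepts:

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    unsafe fn round_both_ways(x: f32) -> (i32, i32) {
        let down = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(_mm_set_ss(x));
        let up = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(_mm_set_ss(x));
        (down, up) // e.g. x = 2.5 gives (2, 3)
    }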
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
@@ -37188,8 +37108,7 @@ pub unsafe fn _mm_cvt_roundss_si32<const ROUNDING: i32>(a: __m128) -> i32 {
pub unsafe fn _mm_cvt_roundss_i32<const ROUNDING: i32>(a: __m128) -> i32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f32x4();
- let r = vcvtss2si(a, ROUNDING);
- transmute(r)
+ vcvtss2si(a, ROUNDING)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\
@@ -37208,8 +37127,7 @@ pub unsafe fn _mm_cvt_roundss_i32<const ROUNDING: i32>(a: __m128) -> i32 {
pub unsafe fn _mm_cvt_roundss_u32<const ROUNDING: i32>(a: __m128) -> u32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f32x4();
- let r = vcvtss2usi(a, ROUNDING);
- transmute(r)
+ vcvtss2usi(a, ROUNDING)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
@@ -37219,7 +37137,7 @@ pub unsafe fn _mm_cvt_roundss_u32<const ROUNDING: i32>(a: __m128) -> u32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2si))]
pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 {
- transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
@@ -37229,7 +37147,7 @@ pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2usi))]
pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 {
- transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
@@ -37248,8 +37166,7 @@ pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 {
pub unsafe fn _mm_cvt_roundsd_si32<const ROUNDING: i32>(a: __m128d) -> i32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f64x2();
- let r = vcvtsd2si(a, ROUNDING);
- transmute(r)
+ vcvtsd2si(a, ROUNDING)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
@@ -37268,8 +37185,7 @@ pub unsafe fn _mm_cvt_roundsd_si32<const ROUNDING: i32>(a: __m128d) -> i32 {
pub unsafe fn _mm_cvt_roundsd_i32<const ROUNDING: i32>(a: __m128d) -> i32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f64x2();
- let r = vcvtsd2si(a, ROUNDING);
- transmute(r)
+ vcvtsd2si(a, ROUNDING)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\
@@ -37288,8 +37204,7 @@ pub unsafe fn _mm_cvt_roundsd_i32<const ROUNDING: i32>(a: __m128d) -> i32 {
pub unsafe fn _mm_cvt_roundsd_u32<const ROUNDING: i32>(a: __m128d) -> u32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f64x2();
- let r = vcvtsd2usi(a, ROUNDING);
- transmute(r)
+ vcvtsd2usi(a, ROUNDING)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
@@ -37299,7 +37214,7 @@ pub unsafe fn _mm_cvt_roundsd_u32<const ROUNDING: i32>(a: __m128d) -> u32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2si))]
pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 {
- transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
@@ -37309,7 +37224,7 @@ pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2usi))]
pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 {
- transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -37382,8 +37297,7 @@ pub unsafe fn _mm_cvt_roundu32_ss<const ROUNDING: i32>(a: __m128, b: u32) -> __m
#[cfg_attr(test, assert_instr(vcvtsi2ss))]
pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 {
let b = b as f32;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Convert the signed 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
@@ -37394,8 +37308,7 @@ pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 {
#[cfg_attr(test, assert_instr(vcvtsi2sd))]
pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d {
let b = b as f64;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
@@ -37409,8 +37322,7 @@ pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d {
pub unsafe fn _mm_cvtt_roundss_si32<const SAE: i32>(a: __m128) -> i32 {
static_assert_sae!(SAE);
let a = a.as_f32x4();
- let r = vcvtss2si(a, SAE);
- transmute(r)
+ vcvtss2si(a, SAE)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
@@ -37424,8 +37336,7 @@ pub unsafe fn _mm_cvtt_roundss_si32<const SAE: i32>(a: __m128) -> i32 {
pub unsafe fn _mm_cvtt_roundss_i32<const SAE: i32>(a: __m128) -> i32 {
static_assert_sae!(SAE);
let a = a.as_f32x4();
- let r = vcvtss2si(a, SAE);
- transmute(r)
+ vcvtss2si(a, SAE)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\
@@ -37439,8 +37350,7 @@ pub unsafe fn _mm_cvtt_roundss_i32<const SAE: i32>(a: __m128) -> i32 {
pub unsafe fn _mm_cvtt_roundss_u32<const SAE: i32>(a: __m128) -> u32 {
static_assert_sae!(SAE);
let a = a.as_f32x4();
- let r = vcvtss2usi(a, SAE);
- transmute(r)
+ vcvtss2usi(a, SAE)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
@@ -37450,7 +37360,7 @@ pub unsafe fn _mm_cvtt_roundss_u32<const SAE: i32>(a: __m128) -> u32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2si))]
pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 {
- transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
@@ -37460,7 +37370,7 @@ pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2usi))]
pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 {
- transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
@@ -37474,8 +37384,7 @@ pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 {
pub unsafe fn _mm_cvtt_roundsd_si32<const SAE: i32>(a: __m128d) -> i32 {
static_assert_sae!(SAE);
let a = a.as_f64x2();
- let r = vcvtsd2si(a, SAE);
- transmute(r)
+ vcvtsd2si(a, SAE)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
@@ -37489,8 +37398,7 @@ pub unsafe fn _mm_cvtt_roundsd_si32<const SAE: i32>(a: __m128d) -> i32 {
pub unsafe fn _mm_cvtt_roundsd_i32<const SAE: i32>(a: __m128d) -> i32 {
static_assert_sae!(SAE);
let a = a.as_f64x2();
- let r = vcvtsd2si(a, SAE);
- transmute(r)
+ vcvtsd2si(a, SAE)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\
@@ -37504,8 +37412,7 @@ pub unsafe fn _mm_cvtt_roundsd_i32<const SAE: i32>(a: __m128d) -> i32 {
pub unsafe fn _mm_cvtt_roundsd_u32<const SAE: i32>(a: __m128d) -> u32 {
static_assert_sae!(SAE);
let a = a.as_f64x2();
- let r = vcvtsd2usi(a, SAE);
- transmute(r)
+ vcvtsd2usi(a, SAE)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
@@ -37515,7 +37422,7 @@ pub unsafe fn _mm_cvtt_roundsd_u32<const SAE: i32>(a: __m128d) -> u32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2si))]
pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 {
- transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
@@ -37525,7 +37432,7 @@ pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2usi))]
pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 {
- transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -37536,8 +37443,7 @@ pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 {
#[cfg_attr(test, assert_instr(vcvtusi2ss))]
pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 {
let b = b as f32;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
@@ -37548,8 +37454,7 @@ pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 {
#[cfg_attr(test, assert_instr(vcvtusi2sd))]
pub unsafe fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d {
let b = b as f64;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\
@@ -37565,8 +37470,7 @@ pub unsafe fn _mm_comi_round_ss<const IMM5: i32, const SAE: i32>(a: __m128, b: _
static_assert_mantissas_sae!(SAE);
let a = a.as_f32x4();
let b = b.as_f32x4();
- let r = vcomiss(a, b, IMM5, SAE);
- transmute(r)
+ vcomiss(a, b, IMM5, SAE)
}
/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\
@@ -37582,8 +37486,7 @@ pub unsafe fn _mm_comi_round_sd<const IMM5: i32, const SAE: i32>(a: __m128d, b:
static_assert_mantissas_sae!(SAE);
let a = a.as_f64x2();
let b = b.as_f64x2();
- let r = vcomisd(a, b, IMM5, SAE);
- transmute(r)
+ vcomisd(a, b, IMM5, SAE)
}
/// Equal
diff --git a/library/stdarch/crates/core_arch/src/x86/sse.rs b/library/stdarch/crates/core_arch/src/x86/sse.rs
index 3d4471ba3..6a2be0921 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse.rs
@@ -790,8 +790,7 @@ pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
///
/// The result is rounded according to the current rounding mode. If the result
/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
-/// (`i32::MIN`) or an invalid operation floating point exception if
-/// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+/// (`i32::MIN`).
///
/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
///
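A quick illustration of the saturating behaviour the docs above now describe (added here, not part of the diff), using the stable `_mm_cvtss_si32`, which compiles to the same CVTSS2SI instruction; the wrapper name is illustrative:

    use std::arch::x86_64::*;

    unsafe fn cvt_examples() {
        assert_eq!(_mm_cvtss_si32(_mm_set_ss(42.7)), 43);           // nearest-even (default mode)
        assert_eq!(_mm_cvtss_si32(_mm_set_ss(3.0e10)), i32::MIN);   // not representable as i32
        assert_eq!(_mm_cvtss_si32(_mm_set_ss(f32::NAN)), i32::MIN); // NaN also yields i32::MIN
    }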
@@ -821,8 +820,7 @@ pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 {
///
/// The result is rounded always using truncation (round towards zero). If the
/// result cannot be represented as a 32 bit integer the result will be
-/// `0x8000_0000` (`i32::MIN`) or an invalid operation floating point
-/// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+/// `0x8000_0000` (`i32::MIN`).
///
/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
///
@@ -1083,7 +1081,10 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(movmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
- movmskps(a)
+ // Propagate the highest bit to the rest, because simd_bitmask
+ // requires all-1 or all-0.
+ let mask: i32x4 = simd_lt(transmute(a), i32x4::splat(0));
+ simd_bitmask::<i32x4, u8>(mask).into()
}
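What the new branch-free implementation computes, as a scalar model (an illustration added here, with a made-up name): bit i of the result is the raw sign bit of lane i, which is why the comparison against zero above first smears each lane's sign bit across the whole lane before simd_bitmask packs them.

    fn movemask_ps_model(lanes: [f32; 4]) -> i32 {
        let mut m = 0;
        for (i, x) in lanes.iter().enumerate() {
            m |= ((x.to_bits() >> 31) as i32) << i; // raw sign bit, so -0.0 and negative NaNs count
        }
        m
    }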
/// Construct a `__m128` with the lowest element read from `p` and the other
@@ -1365,6 +1366,15 @@ pub unsafe fn _mm_sfence() {
/// Gets the unsigned 32-bit value of the MXCSR control and status register.
///
+/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust
+/// floating-point operations may or may not result in this register getting updated with exception
+/// state, and the register can change between two invocations of this function even when no
+/// floating-point operations appear in the source code (since floating-point operations appearing
+/// earlier or later can be reordered).
+///
+/// If you need to perform some floating-point operations and check whether they raised an
+/// exception, use an inline assembly block for the entire sequence of operations.
+///
/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr)
@@ -1372,6 +1382,10 @@ pub unsafe fn _mm_sfence() {
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(stmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _mm_getcsr() -> u32 {
let mut result = 0_i32;
stmxcsr(&mut result as *mut _ as *mut i8);
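A minimal sketch of the replacement pattern the new documentation recommends (my addition, assuming an x86-64 target; `divss` is just a stand-in for whatever operation you want to observe and the function name is illustrative): the floating-point work and the MXCSR read stay inside one `asm!` block, so the optimizer cannot separate them.

    use std::arch::asm;

    unsafe fn divss_then_read_mxcsr(a: f32, b: f32) -> (f32, u32) {
        let mut q = a;
        let mut csr: u32 = 0;
        asm!(
            "divss {q}, {d}",      // the FP operation of interest
            "stmxcsr [{csr_ptr}]", // read MXCSR in the same asm block
            q = inout(xmm_reg) q,
            d = in(xmm_reg) b,
            csr_ptr = in(reg) core::ptr::addr_of_mut!(csr),
            options(nostack),
        );
        (q, csr)
    }

The same single-block discipline applies to the _mm_setcsr-style changes further down: alter MXCSR, perform the operations, and restore the saved value before the asm block ends.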
@@ -1401,6 +1415,16 @@ pub unsafe fn _mm_getcsr() -> u32 {
/// * The *denormals-are-zero mode flag* turns all numbers which would be
/// denormalized (exponent bits are all zeros) into zeros.
///
+/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to
+/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and
+/// will optimize accordingly. This even applies when the register is altered and later reset to its
+/// original value without any floating-point operations appearing in the source code between those
+/// operations (since floating-point operations appearing earlier or later can be reordered).
+///
+/// If you need to perform some floating-point operations under different masking flags, rounding
+/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the
+/// original MXCSR register state before the end of the block.
+///
/// ## Exception Flags
///
/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
@@ -1509,6 +1533,10 @@ pub unsafe fn _mm_getcsr() -> u32 {
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ldmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _mm_setcsr(val: u32) {
ldmxcsr(&val as *const _ as *const i8);
}
@@ -1588,9 +1616,14 @@ pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
_mm_getcsr() & _MM_MASK_MASK
}
@@ -1599,9 +1632,14 @@ pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
_mm_getcsr() & _MM_EXCEPT_MASK
}
@@ -1610,9 +1648,14 @@ pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
_mm_getcsr() & _MM_FLUSH_ZERO_MASK
}
@@ -1621,9 +1664,14 @@ pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
_mm_getcsr() & _MM_ROUND_MASK
}
@@ -1632,9 +1680,14 @@ pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
_mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
}
@@ -1643,9 +1696,14 @@ pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
_mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
}
@@ -1654,9 +1712,14 @@ pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
// println!("setting csr={:x}", val);
@@ -1667,9 +1730,14 @@ pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
_mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
}
@@ -1820,8 +1888,6 @@ extern "C" {
fn maxss(a: __m128, b: __m128) -> __m128;
#[link_name = "llvm.x86.sse.max.ps"]
fn maxps(a: __m128, b: __m128) -> __m128;
- #[link_name = "llvm.x86.sse.movmsk.ps"]
- fn movmskps(a: __m128) -> i32;
#[link_name = "llvm.x86.sse.cmp.ps"]
fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
#[link_name = "llvm.x86.sse.comieq.ss"]
@@ -1974,7 +2040,11 @@ mod tests {
let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
let r = _mm_rcp_ss(a);
let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
- assert_eq_m128(r, e);
+ let rel_err = 0.00048828125;
+ assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err);
+ for i in 1..4 {
+ assert_eq!(get_m128(r, i), get_m128(e, i));
+ }
}
#[simd_test(enable = "sse")]
@@ -2055,6 +2125,17 @@ mod tests {
let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
let r = _mm_max_ps(a, b);
assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));
+
+ // Check SSE-specific semantics for -0.0 handling.
+ let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
+ let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
+ let r1: [u8; 16] = transmute(_mm_max_ps(a, b));
+ let r2: [u8; 16] = transmute(_mm_max_ps(b, a));
+ let a: [u8; 16] = transmute(a);
+ let b: [u8; 16] = transmute(b);
+ assert_eq!(r1, b);
+ assert_eq!(r2, a);
+ assert_ne!(a, b); // sanity check that -0.0 is actually present
}
#[simd_test(enable = "sse")]
@@ -2098,12 +2179,12 @@ mod tests {
let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
- let e: u32x4 = transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0));
+ let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0));
assert_eq!(r, e);
let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
- let e2: u32x4 = transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0));
+ let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0));
assert_eq!(r2, e2);
}
@@ -2119,15 +2200,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) < d.extract(0)
let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2143,15 +2224,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) <= d.extract(0)
let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2167,15 +2248,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) > d.extract(0)
let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2191,15 +2272,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) >= d.extract(0)
let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2215,15 +2296,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) != d.extract(0)
let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2244,15 +2325,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) >= d.extract(0)
let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2273,15 +2354,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) > d.extract(0)
let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2302,15 +2383,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) <= d.extract(0)
let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2331,15 +2412,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) < d.extract(0)
let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2355,15 +2436,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) ord d.extract(0)
let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2379,15 +2460,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) unord d.extract(0)
let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2766,7 +2847,9 @@ mod tests {
}
}
+ #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
#[simd_test(enable = "sse")]
+ #[cfg_attr(miri, ignore)] // Uses _mm_setcsr, which is not supported by Miri
unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
// If one of the arguments is a quiet NaN `comieq_ss` should signal an
// Invalid Operation Exception while `ucomieq_ss` should not.
@@ -3072,7 +3155,7 @@ mod tests {
let mut p = vals.as_mut_ptr();
if (p as usize) & 0xf != 0 {
- ofs = ((16 - (p as usize)) & 0xf) >> 2;
+ ofs = (16 - ((p as usize) & 0xf)) >> 2;
p = p.add(ofs);
}
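The corrected offset arithmetic converts the misalignment in bytes into a count of f32 elements to skip: with an address ending in 0x4, (16 - 4) >> 2 == 3 elements reach the next 16-byte boundary. A small sketch of the same computation, with a hypothetical address value, illustrative only:

    fn f32s_to_next_16_byte_boundary(addr: usize) -> usize {
        // bytes to the next 16-byte boundary, in units of 4-byte f32 elements;
        // only used when the address is actually misaligned (addr & 0xf != 0)
        (16 - (addr & 0xf)) >> 2
    }
    // f32s_to_next_16_byte_boundary(0x1004) == 3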
@@ -3098,7 +3181,7 @@ mod tests {
// Align p to 16-byte boundary
if (p as usize) & 0xf != 0 {
- ofs = ((16 - (p as usize)) & 0xf) >> 2;
+ ofs = (16 - ((p as usize) & 0xf)) >> 2;
p = p.add(ofs);
}
@@ -3124,7 +3207,7 @@ mod tests {
// Align p to 16-byte boundary
if (p as usize) & 0xf != 0 {
- ofs = ((16 - (p as usize)) & 0xf) >> 2;
+ ofs = (16 - ((p as usize) & 0xf)) >> 2;
p = p.add(ofs);
}
@@ -3186,11 +3269,15 @@ mod tests {
}
#[simd_test(enable = "sse")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_sfence() {
_mm_sfence();
}
+ #[allow(deprecated)] // FIXME: This tests functions whose use is immediate UB
#[simd_test(enable = "sse")]
+ #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
unsafe fn test_mm_getcsr_setcsr_1() {
let saved_csr = _mm_getcsr();
@@ -3206,7 +3293,9 @@ mod tests {
assert_eq_m128(r, exp); // first component is a denormalized f32
}
+ #[allow(deprecated)] // FIXME: This tests functions whose use is immediate UB
#[simd_test(enable = "sse")]
+ #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
unsafe fn test_mm_getcsr_setcsr_2() {
// Same as _mm_setcsr_1 test, but with opposite flag value.
@@ -3224,7 +3313,9 @@ mod tests {
assert_eq_m128(r, exp); // first component is a denormalized f32
}
+ #[allow(deprecated)] // FIXME: This tests functions whose use is immediate UB
#[simd_test(enable = "sse")]
+ #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
unsafe fn test_mm_getcsr_setcsr_underflow() {
_MM_SET_EXCEPTION_STATE(0);
@@ -3263,6 +3354,9 @@ mod tests {
}
#[simd_test(enable = "sse")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_ps() {
let a = _mm_set1_ps(7.0);
let mut mem = Memory { data: [-1.0; 4] };
diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs
index 3d572a1f5..7831ea743 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs
@@ -165,7 +165,10 @@ pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
- transmute(pavgb(a.as_u8x16(), b.as_u8x16()))
+ let a = simd_cast::<_, u16x16>(a.as_u8x16());
+ let b = simd_cast::<_, u16x16>(b.as_u8x16());
+ let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
+ transmute(simd_cast::<_, u8x16>(r))
}
/// Averages packed unsigned 16-bit integers in `a` and `b`.
@@ -176,7 +179,10 @@ pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
- transmute(pavgw(a.as_u16x8(), b.as_u16x8()))
+ let a = simd_cast::<_, u32x8>(a.as_u16x8());
+ let b = simd_cast::<_, u32x8>(b.as_u16x8());
+ let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
+ transmute(simd_cast::<_, u16x8>(r))
}
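Both averaging rewrites compute the architectural rounding average, (a + b + 1) >> 1, in a lane type twice as wide so the +1 bias and the sum cannot overflow. A scalar sketch of the same rule, illustrative only and not part of the diff:

    fn avg_u8(a: u8, b: u8) -> u8 {
        // widen so the rounding bias cannot overflow, then shift back down
        ((a as u16 + b as u16 + 1) >> 1) as u8
    }
    // avg_u8(255, 254) == 255; an 8-bit addition would have wrapped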
/// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`.
@@ -261,7 +267,10 @@ pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
- transmute(pmulhw(a.as_i16x8(), b.as_i16x8()))
+ let a = simd_cast::<_, i32x8>(a.as_i16x8());
+ let b = simd_cast::<_, i32x8>(b.as_i16x8());
+ let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
+ transmute(simd_cast::<i32x8, i16x8>(r))
}
/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
@@ -275,7 +284,10 @@ pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
- transmute(pmulhuw(a.as_u16x8(), b.as_u16x8()))
+ let a = simd_cast::<_, u32x8>(a.as_u16x8());
+ let b = simd_cast::<_, u32x8>(b.as_u16x8());
+ let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
+ transmute(simd_cast::<u32x8, u16x8>(r))
}
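pmulhw and pmulhuw return the high 16 bits of the full 32-bit product, which is exactly what the widening multiply followed by a 16-bit shift expresses. A scalar sketch, illustrative only:

    fn mulhi_i16(a: i16, b: i16) -> i16 {
        ((a as i32 * b as i32) >> 16) as i16
    }
    fn mulhi_u16(a: u16, b: u16) -> u16 {
        ((a as u32 * b as u32) >> 16) as u16
    }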
/// Multiplies the packed 16-bit integers in `a` and `b`.
@@ -303,7 +315,10 @@ pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
- transmute(pmuludq(a.as_u32x4(), b.as_u32x4()))
+ let a = a.as_u64x2();
+ let b = b.as_u64x2();
+ let mask = u64x2::splat(u32::MAX.into());
+ transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
}
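_mm_mul_epu32 multiplies only the low 32 bits of each 64-bit lane, so masking both operands with u32::MAX before a plain 64-bit multiply reproduces the instruction without the dedicated intrinsic call. A per-lane scalar sketch, illustrative only:

    fn mul_lo32_unsigned(a: u64, b: u64) -> u64 {
        // only the low 32 bits of each lane participate; the product fits in 64 bits
        (a & u32::MAX as u64) * (b & u32::MAX as u64)
    }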
/// Sum the absolute differences of packed unsigned 8-bit integers.
@@ -952,7 +967,7 @@ pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
#[cfg_attr(test, assert_instr(cvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
- cvtdq2ps(a.as_i32x4())
+ transmute(simd_cast::<_, f32x4>(a.as_i32x4()))
}
/// Converts packed single-precision (32-bit) floating-point elements in `a`
@@ -2240,7 +2255,9 @@ pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
#[cfg_attr(test, assert_instr(cvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
- cvtpd2ps(a)
+ let r = simd_cast::<_, f32x2>(a.as_f64x2());
+ let zero = f32x2::new(0.0, 0.0);
+ transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
}
/// Converts packed single-precision (32-bit) floating-point elements in `a` to
@@ -2253,7 +2270,8 @@ pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
#[cfg_attr(test, assert_instr(cvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d {
- cvtps2pd(a)
+ let a = a.as_f32x4();
+ transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
}
/// Converts packed double-precision (64-bit) floating-point elements in `a` to
@@ -2432,7 +2450,10 @@ pub unsafe fn _mm_setzero_pd() -> __m128d {
#[cfg_attr(test, assert_instr(movmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 {
- movmskpd(a)
+ // Propagate the highest bit to the rest, because simd_bitmask
+ // requires all-1 or all-0.
+ let mask: i64x2 = simd_lt(transmute(a), i64x2::splat(0));
+ simd_bitmask::<i64x2, u8>(mask).into()
}
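simd_bitmask packs one bit per lane but expects each lane to be all-zeros or all-ones, so the sign bit is first broadcast across the lane with a signed less-than-zero compare. A scalar sketch of the resulting movemask, illustrative only:

    fn movemask_pd_scalar(lanes: [f64; 2]) -> i32 {
        let mut mask = 0;
        for (i, lane) in lanes.iter().enumerate() {
            // bit 63 of each lane's raw encoding is its sign bit
            mask |= ((lane.to_bits() >> 63) as i32) << i;
        }
        mask
    }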
/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
@@ -2826,18 +2847,8 @@ extern "C" {
fn lfence();
#[link_name = "llvm.x86.sse2.mfence"]
fn mfence();
- #[link_name = "llvm.x86.sse2.pavg.b"]
- fn pavgb(a: u8x16, b: u8x16) -> u8x16;
- #[link_name = "llvm.x86.sse2.pavg.w"]
- fn pavgw(a: u16x8, b: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse2.pmadd.wd"]
fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
- #[link_name = "llvm.x86.sse2.pmulh.w"]
- fn pmulhw(a: i16x8, b: i16x8) -> i16x8;
- #[link_name = "llvm.x86.sse2.pmulhu.w"]
- fn pmulhuw(a: u16x8, b: u16x8) -> u16x8;
- #[link_name = "llvm.x86.sse2.pmulu.dq"]
- fn pmuludq(a: u32x4, b: u32x4) -> u64x2;
#[link_name = "llvm.x86.sse2.psad.bw"]
fn psadbw(a: u8x16, b: u8x16) -> u64x2;
#[link_name = "llvm.x86.sse2.psll.w"]
@@ -2856,8 +2867,6 @@ extern "C" {
fn psrld(a: i32x4, count: i32x4) -> i32x4;
#[link_name = "llvm.x86.sse2.psrl.q"]
fn psrlq(a: i64x2, count: i64x2) -> i64x2;
- #[link_name = "llvm.x86.sse2.cvtdq2ps"]
- fn cvtdq2ps(a: i32x4) -> __m128;
#[link_name = "llvm.x86.sse2.cvtps2dq"]
fn cvtps2dq(a: __m128) -> i32x4;
#[link_name = "llvm.x86.sse2.maskmov.dqu"]
@@ -2908,12 +2917,6 @@ extern "C" {
fn ucomigesd(a: __m128d, b: __m128d) -> i32;
#[link_name = "llvm.x86.sse2.ucomineq.sd"]
fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
- #[link_name = "llvm.x86.sse2.movmsk.pd"]
- fn movmskpd(a: __m128d) -> i32;
- #[link_name = "llvm.x86.sse2.cvtpd2ps"]
- fn cvtpd2ps(a: __m128d) -> __m128;
- #[link_name = "llvm.x86.sse2.cvtps2pd"]
- fn cvtps2pd(a: __m128) -> __m128d;
#[link_name = "llvm.x86.sse2.cvtpd2dq"]
fn cvtpd2dq(a: __m128d) -> i32x4;
#[link_name = "llvm.x86.sse2.cvtsd2si"]
@@ -2956,11 +2959,15 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_lfence() {
_mm_lfence();
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_mfence() {
_mm_mfence();
}
@@ -3343,83 +3350,124 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_slli_epi16() {
- #[rustfmt::skip]
- let a = _mm_setr_epi16(
- 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
- );
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
let r = _mm_slli_epi16::<4>(a);
-
- #[rustfmt::skip]
- let e = _mm_setr_epi16(
- 0xFFF0 as u16 as i16, 0xFFF0 as u16 as i16, 0x0FF0, 0x00F0,
- 0, 0, 0, 0,
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
);
- assert_eq_m128i(r, e);
+ let r = _mm_slli_epi16::<16>(a);
+ assert_eq_m128i(r, _mm_set1_epi16(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sll_epi16() {
- let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0);
- let r = _mm_sll_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0));
- assert_eq_m128i(r, _mm_setr_epi16(0xFF0, 0, 0, 0, 0, 0, 0, 0));
- let r = _mm_sll_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0));
- assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0));
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
+ let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
+ );
+ let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_set1_epi16(0));
+ let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi16(0));
}
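The reworked shift tests pin down two PSLLW details: only the low 64 bits of the count vector are used (so _mm_set_epi64x(4, 0) leaves `a` unchanged), and a count of 16 or more zeroes every lane instead of being masked the way a scalar shift count would be. A per-lane sketch of the zeroing rule, illustrative only:

    fn sll_epi16_lane(x: i16, count: u64) -> i16 {
        // counts of 16 or more clear the lane rather than wrapping the shift amount
        if count >= 16 { 0 } else { ((x as u16) << count) as i16 }
    }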
#[simd_test(enable = "sse2")]
unsafe fn test_mm_slli_epi32() {
- let r = _mm_slli_epi32::<4>(_mm_set1_epi32(0xFFFF));
- assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_slli_epi32::<4>(a);
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
+ let r = _mm_slli_epi32::<32>(a);
+ assert_eq_m128i(r, _mm_set1_epi32(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sll_epi32() {
- let a = _mm_set1_epi32(0xFFFF);
- let b = _mm_setr_epi32(4, 0, 0, 0);
- let r = _mm_sll_epi32(a, b);
- assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
+ let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
+ assert_eq_m128i(r, _mm_set1_epi32(0));
+ let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi32(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_slli_epi64() {
- let r = _mm_slli_epi64::<4>(_mm_set1_epi64x(0xFFFFFFFF));
- assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0));
+ let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm_slli_epi64::<4>(a);
+ assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
+ let r = _mm_slli_epi64::<64>(a);
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sll_epi64() {
- let a = _mm_set1_epi64x(0xFFFFFFFF);
- let b = _mm_setr_epi64x(4, 0);
- let r = _mm_sll_epi64(a, b);
- assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0));
+ let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
+ let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
+ let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srai_epi16() {
- let r = _mm_srai_epi16::<1>(_mm_set1_epi16(-1));
- assert_eq_m128i(r, _mm_set1_epi16(-1));
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
+ let r = _mm_srai_epi16::<4>(a);
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
+ );
+ let r = _mm_srai_epi16::<16>(a);
+ assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sra_epi16() {
- let a = _mm_set1_epi16(-1);
- let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- let r = _mm_sra_epi16(a, b);
- assert_eq_m128i(r, _mm_set1_epi16(-1));
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
+ let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
+ );
+ let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
+ let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
}
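For PSRAW the oversized-count behaviour differs: counts of 16 or more act like a shift by 15, so each lane collapses to 0 or -1 depending on its sign, which is what the ::<16> and i64::MAX cases check. A per-lane sketch, illustrative only:

    fn sra_epi16_lane(x: i16, count: u64) -> i16 {
        // arithmetic right shifts saturate at 15 bits, filling with the sign bit
        x >> count.min(15)
    }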
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srai_epi32() {
- let r = _mm_srai_epi32::<1>(_mm_set1_epi32(-1));
- assert_eq_m128i(r, _mm_set1_epi32(-1));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_srai_epi32::<4>(a);
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
+ let r = _mm_srai_epi32::<32>(a);
+ assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sra_epi32() {
- let a = _mm_set1_epi32(-1);
- let b = _mm_setr_epi32(1, 0, 0, 0);
- let r = _mm_sra_epi32(a, b);
- assert_eq_m128i(r, _mm_set1_epi32(-1));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
+ let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
+ assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
+ let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
}
#[simd_test(enable = "sse2")]
@@ -3453,53 +3501,74 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srli_epi16() {
- #[rustfmt::skip]
- let a = _mm_setr_epi16(
- 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
- );
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
let r = _mm_srli_epi16::<4>(a);
- #[rustfmt::skip]
- let e = _mm_setr_epi16(
- 0xFFF as u16 as i16, 0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0,
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
);
- assert_eq_m128i(r, e);
+ let r = _mm_srli_epi16::<16>(a);
+ assert_eq_m128i(r, _mm_set1_epi16(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srl_epi16() {
- let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0);
- let r = _mm_srl_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0));
- assert_eq_m128i(r, _mm_setr_epi16(0xF, 0, 0, 0, 0, 0, 0, 0));
- let r = _mm_srl_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0));
- assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0));
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
+ let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
+ );
+ let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_set1_epi16(0));
+ let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi16(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srli_epi32() {
- let r = _mm_srli_epi32::<4>(_mm_set1_epi32(0xFFFF));
- assert_eq_m128i(r, _mm_set1_epi32(0xFFF));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_srli_epi32::<4>(a);
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
+ let r = _mm_srli_epi32::<32>(a);
+ assert_eq_m128i(r, _mm_set1_epi32(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srl_epi32() {
- let a = _mm_set1_epi32(0xFFFF);
- let b = _mm_setr_epi32(4, 0, 0, 0);
- let r = _mm_srl_epi32(a, b);
- assert_eq_m128i(r, _mm_set1_epi32(0xFFF));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
+ let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
+ assert_eq_m128i(r, _mm_set1_epi32(0));
+ let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi32(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srli_epi64() {
- let r = _mm_srli_epi64::<4>(_mm_set1_epi64x(0xFFFFFFFF));
- assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF));
+ let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm_srli_epi64::<4>(a);
+ assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
+ let r = _mm_srli_epi64::<64>(a);
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srl_epi64() {
- let a = _mm_set1_epi64x(0xFFFFFFFF);
- let b = _mm_setr_epi64x(4, 0);
- let r = _mm_srl_epi64(a, b);
- assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF));
+ let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
+ let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
+ let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
}
#[simd_test(enable = "sse2")]
@@ -3766,6 +3835,9 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_maskmoveu_si128() {
let a = _mm_set1_epi8(9);
#[rustfmt::skip]
@@ -3804,6 +3876,9 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_si128() {
let a = _mm_setr_epi32(1, 2, 3, 4);
let mut r = _mm_undefined_si128();
@@ -3812,6 +3887,9 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_si32() {
let a: i32 = 7;
let mut mem = boxed::Box::<i32>::new(-1);
@@ -4055,6 +4133,17 @@ mod tests {
let b = _mm_setr_pd(5.0, 10.0);
let r = _mm_max_pd(a, b);
assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));
+
+ // Check SSE(2)-specific semantics for -0.0 handling.
+ let a = _mm_setr_pd(-0.0, 0.0);
+ let b = _mm_setr_pd(0.0, 0.0);
+ let r1: [u8; 16] = transmute(_mm_max_pd(a, b));
+ let r2: [u8; 16] = transmute(_mm_max_pd(b, a));
+ let a: [u8; 16] = transmute(a);
+ let b: [u8; 16] = transmute(b);
+ assert_eq!(r1, b);
+ assert_eq!(r2, a);
+ assert_ne!(a, b); // sanity check that -0.0 is actually present
}
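These byte-level assertions encode the SSE rule that maxpd returns its second operand whenever the first is not strictly greater, so max(-0.0, 0.0) is +0.0 while max(0.0, -0.0) is -0.0; the minpd test below mirrors it. A scalar sketch of the rule, illustrative only:

    fn sse_max_pd_lane(a: f64, b: f64) -> f64 {
        // b is returned for equal operands (including signed zeros) and for NaN inputs
        if a > b { a } else { b }
    }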
#[simd_test(enable = "sse2")]
@@ -4071,6 +4160,17 @@ mod tests {
let b = _mm_setr_pd(5.0, 10.0);
let r = _mm_min_pd(a, b);
assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
+
+ // Check SSE(2)-specific semantics for -0.0 handling.
+ let a = _mm_setr_pd(-0.0, 0.0);
+ let b = _mm_setr_pd(0.0, 0.0);
+ let r1: [u8; 16] = transmute(_mm_min_pd(a, b));
+ let r2: [u8; 16] = transmute(_mm_min_pd(b, a));
+ let a: [u8; 16] = transmute(a);
+ let b: [u8; 16] = transmute(b);
+ assert_eq!(r1, b);
+ assert_eq!(r2, a);
+ assert_ne!(a, b); // sanity check that -0.0 is actually present
}
#[simd_test(enable = "sse2")]
@@ -4158,7 +4258,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpeq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4166,7 +4266,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmplt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4174,7 +4274,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmple_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4182,7 +4282,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpgt_sd() {
let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4190,7 +4290,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpge_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4198,7 +4298,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpord_sd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4206,7 +4306,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpunord_sd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4214,7 +4314,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpneq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4222,7 +4322,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpnlt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4230,7 +4330,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpnle_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4238,7 +4338,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpngt_sd() {
let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4246,7 +4346,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpnge_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4478,6 +4578,9 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_pd() {
#[repr(align(128))]
struct Memory {
diff --git a/library/stdarch/crates/core_arch/src/x86/sse3.rs b/library/stdarch/crates/core_arch/src/x86/sse3.rs
index 092a8d9cd..df0d78e5b 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse3.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse3.rs
@@ -1,7 +1,7 @@
//! Streaming SIMD Extensions 3 (SSE3)
use crate::{
- core_arch::{simd::*, simd_llvm::simd_shuffle, x86::*},
+ core_arch::{simd::*, simd_llvm::*, x86::*},
mem::transmute,
};
@@ -17,7 +17,11 @@ use stdarch_test::assert_instr;
#[cfg_attr(test, assert_instr(addsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
- addsubps(a, b)
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let add = simd_add(a, b);
+ let sub = simd_sub(a, b);
+ simd_shuffle!(add, sub, [4, 1, 6, 3])
}
/// Alternatively add and subtract packed double-precision (64-bit)
@@ -29,7 +33,11 @@ pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(addsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d {
- addsubpd(a, b)
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let add = simd_add(a, b);
+ let sub = simd_sub(a, b);
+ simd_shuffle!(add, sub, [2, 1])
}
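In simd_shuffle! the index vector addresses the concatenation of both inputs, so indices at or above the lane count select from the second argument: [2, 1] takes lane 0 from `sub` and lane 1 from `add`, i.e. subtract in the even lane, add in the odd lane. A two-lane scalar sketch, illustrative only:

    fn addsub_pd(a: [f64; 2], b: [f64; 2]) -> [f64; 2] {
        // even lane subtracts, odd lane adds, matching the [2, 1] shuffle of (add, sub)
        [a[0] - b[0], a[1] + b[1]]
    }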
/// Horizontally adds adjacent pairs of double-precision (64-bit)
@@ -143,10 +151,6 @@ pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 {
#[allow(improper_ctypes)]
extern "C" {
- #[link_name = "llvm.x86.sse3.addsub.ps"]
- fn addsubps(a: __m128, b: __m128) -> __m128;
- #[link_name = "llvm.x86.sse3.addsub.pd"]
- fn addsubpd(a: __m128d, b: __m128d) -> __m128d;
#[link_name = "llvm.x86.sse3.hadd.pd"]
fn haddpd(a: __m128d, b: __m128d) -> __m128d;
#[link_name = "llvm.x86.sse3.hadd.ps"]
diff --git a/library/stdarch/crates/core_arch/src/x86/sse41.rs b/library/stdarch/crates/core_arch/src/x86/sse41.rs
index 7ba86e5f7..6d33238b0 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse41.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs
@@ -62,7 +62,8 @@ pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTI
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
- transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
+ let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::splat(0));
+ transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
}
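pblendvb keys each byte off the most significant bit of the mask; the simd_lt against zero expands that bit into an all-ones or all-zeros lane so simd_select can choose between `b` and `a`. A per-byte sketch, illustrative only:

    fn blendv_byte(a: i8, b: i8, mask: i8) -> i8 {
        // mask < 0 exactly when its top bit is set
        if mask < 0 { b } else { a }
    }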
/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
@@ -74,15 +75,25 @@ pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
-// Note: LLVM7 prefers the single-precision floating-point domain when possible
-// see https://bugs.llvm.org/show_bug.cgi?id=38195
-// #[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xF0))]
-#[cfg_attr(test, assert_instr(blendps, IMM8 = 0xF0))]
+#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
static_assert_uimm_bits!(IMM8, 8);
- transmute(pblendw(a.as_i16x8(), b.as_i16x8(), IMM8 as u8))
+ transmute::<i16x8, _>(simd_shuffle!(
+ a.as_i16x8(),
+ b.as_i16x8(),
+ [
+ [0, 8][IMM8 as usize & 1],
+ [1, 9][(IMM8 >> 1) as usize & 1],
+ [2, 10][(IMM8 >> 2) as usize & 1],
+ [3, 11][(IMM8 >> 3) as usize & 1],
+ [4, 12][(IMM8 >> 4) as usize & 1],
+ [5, 13][(IMM8 >> 5) as usize & 1],
+ [6, 14][(IMM8 >> 6) as usize & 1],
+ [7, 15][(IMM8 >> 7) as usize & 1],
+ ]
+ ))
}
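Each shuffle index is picked by one bit of IMM8: [k, k + 8][bit] resolves to lane k of `a` when the bit is 0 and to lane k of `b` (offset by 8 into the concatenation) when it is 1. The same selection as a scalar loop, illustrative only:

    fn blend_epi16(a: [i16; 8], b: [i16; 8], imm8: u8) -> [i16; 8] {
        let mut r = a;
        for k in 0..8 {
            if (imm8 >> k) & 1 != 0 {
                r[k] = b[k]; // bit k set: take lane k from b
            }
        }
        r
    }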
/// Blend packed double-precision (64-bit) floating-point elements from `a`
@@ -94,7 +105,8 @@ pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
- blendvpd(a, b, mask)
+ let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::splat(0));
+ transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
}
/// Blend packed single-precision (32-bit) floating-point elements from `a`
@@ -106,7 +118,8 @@ pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
- blendvps(a, b, mask)
+ let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::splat(0));
+ transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
}
/// Blend packed double-precision (64-bit) floating-point elements from `a`
@@ -123,7 +136,11 @@ pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
static_assert_uimm_bits!(IMM2, 2);
- blendpd(a, b, IMM2 as u8)
+ transmute::<f64x2, _>(simd_shuffle!(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
+ ))
}
/// Blend packed single-precision (32-bit) floating-point elements from `a`
@@ -137,7 +154,16 @@ pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
static_assert_uimm_bits!(IMM4, 4);
- blendps(a, b, IMM4 as u8)
+ transmute::<f32x4, _>(simd_shuffle!(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ [
+ [0, 4][IMM4 as usize & 1],
+ [1, 5][(IMM4 >> 1) as usize & 1],
+ [2, 6][(IMM4 >> 2) as usize & 1],
+ [3, 7][(IMM4 >> 3) as usize & 1],
+ ]
+ ))
}
/// Extracts a single-precision (32-bit) floating-point element from `a`,
@@ -175,7 +201,7 @@ pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
static_assert_uimm_bits!(IMM8, 2);
- transmute(simd_extract::<_, f32>(a, IMM8 as u32))
+ simd_extract::<_, f32>(a, IMM8 as u32).to_bits() as i32
}
/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
@@ -923,7 +949,9 @@ pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
- transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
+ let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2()));
+ let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2()));
+ transmute(simd_mul(a, b))
}
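Here the two nested simd_casts truncate each 64-bit lane to its low 32 bits and then sign-extend back, which is the operand preparation PMULDQ performs before the full 64-bit multiply. A per-lane scalar sketch, illustrative only:

    fn mul_lo32_signed(a: i64, b: i64) -> i64 {
        // truncate to i32, sign-extend back to i64, then take the full product
        (a as i32 as i64) * (b as i32 as i64)
    }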
/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate
@@ -1124,18 +1152,6 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
#[allow(improper_ctypes)]
extern "C" {
- #[link_name = "llvm.x86.sse41.pblendvb"]
- fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
- #[link_name = "llvm.x86.sse41.blendvpd"]
- fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
- #[link_name = "llvm.x86.sse41.blendvps"]
- fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
- #[link_name = "llvm.x86.sse41.blendpd"]
- fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
- #[link_name = "llvm.x86.sse41.blendps"]
- fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
- #[link_name = "llvm.x86.sse41.pblendw"]
- fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
#[link_name = "llvm.x86.sse41.insertps"]
fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
#[link_name = "llvm.x86.sse41.packusdw"]
@@ -1154,8 +1170,6 @@ extern "C" {
fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
#[link_name = "llvm.x86.sse41.phminposuw"]
fn phminposuw(a: u16x8) -> u16x8;
- #[link_name = "llvm.x86.sse41.pmuldq"]
- fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
#[link_name = "llvm.x86.sse41.mpsadbw"]
fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
#[link_name = "llvm.x86.sse41.ptestz"]
@@ -1245,9 +1259,9 @@ mod tests {
#[simd_test(enable = "sse4.1")]
unsafe fn test_mm_extract_ps() {
let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
- let r: f32 = transmute(_mm_extract_ps::<1>(a));
+ let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32);
assert_eq!(r, 1.0);
- let r: f32 = transmute(_mm_extract_ps::<3>(a));
+ let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32);
assert_eq!(r, 3.0);
}
@@ -1668,6 +1682,7 @@ mod tests {
assert_eq_m128(r, e);
}
+ #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
#[simd_test(enable = "sse4.1")]
unsafe fn test_mm_round_sd() {
let a = _mm_setr_pd(1.5, 3.5);
@@ -1680,6 +1695,7 @@ mod tests {
assert_eq_m128d(r, e);
}
+ #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
#[simd_test(enable = "sse4.1")]
unsafe fn test_mm_round_ss() {
let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
diff --git a/library/stdarch/crates/core_arch/src/x86/test.rs b/library/stdarch/crates/core_arch/src/x86/test.rs
index ec4298033..50b2d93be 100644
--- a/library/stdarch/crates/core_arch/src/x86/test.rs
+++ b/library/stdarch/crates/core_arch/src/x86/test.rs
@@ -3,11 +3,13 @@
use crate::core_arch::x86::*;
use std::mem::transmute;
+#[track_caller]
#[target_feature(enable = "sse2")]
pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
}
+#[track_caller]
#[target_feature(enable = "sse2")]
pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
@@ -20,6 +22,7 @@ pub unsafe fn get_m128d(a: __m128d, idx: usize) -> f64 {
transmute::<_, [f64; 2]>(a)[idx]
}
+#[track_caller]
#[target_feature(enable = "sse")]
pub unsafe fn assert_eq_m128(a: __m128, b: __m128) {
let r = _mm_cmpeq_ps(a, b);
@@ -40,11 +43,13 @@ pub unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
_mm_set_epi64x(b, a)
}
+#[track_caller]
#[target_feature(enable = "avx")]
pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b))
}
+#[track_caller]
#[target_feature(enable = "avx")]
pub unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) {
let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b);
@@ -58,6 +63,7 @@ pub unsafe fn get_m256d(a: __m256d, idx: usize) -> f64 {
transmute::<_, [f64; 4]>(a)[idx]
}
+#[track_caller]
#[target_feature(enable = "avx")]
pub unsafe fn assert_eq_m256(a: __m256, b: __m256) {
let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b);
@@ -125,10 +131,12 @@ mod x86_polyfill {
}
pub use self::x86_polyfill::*;
+#[track_caller]
pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) {
assert_eq!(transmute::<_, [i32; 16]>(a), transmute::<_, [i32; 16]>(b))
}
+#[track_caller]
pub unsafe fn assert_eq_m512(a: __m512, b: __m512) {
let cmp = _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b);
if cmp != 0b11111111_11111111 {
@@ -136,6 +144,7 @@ pub unsafe fn assert_eq_m512(a: __m512, b: __m512) {
}
}
+#[track_caller]
pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) {
let cmp = _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b);
if cmp != 0b11111111 {