author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-06-07 05:48:48 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-06-07 05:48:48 +0000
commit     ef24de24a82fe681581cc130f342363c47c0969a (patch)
tree       0d494f7e1a38b95c92426f58fe6eaa877303a86c /library/stdarch/crates/core_arch/src/x86
parent     Releasing progress-linux version 1.74.1+dfsg1-1~progress7.99u1. (diff)
Merging upstream version 1.75.0+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'library/stdarch/crates/core_arch/src/x86')
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx.rs           |  40
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx2.rs          |  46
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs  |  12
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx512bw.rs      | 104
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx512f.rs       | 333
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/sse.rs           | 186
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/sse2.rs          | 309
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/sse3.rs          |  18
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/sse41.rs         |  72
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/test.rs          |   9
10 files changed, 621 insertions, 508 deletions
diff --git a/library/stdarch/crates/core_arch/src/x86/avx.rs b/library/stdarch/crates/core_arch/src/x86/avx.rs
index 00bcc1fa1..de5dc05b8 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx.rs
@@ -268,7 +268,11 @@ pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
- addsubpd256(a, b)
+ let a = a.as_f64x4();
+ let b = b.as_f64x4();
+ let add = simd_add(a, b);
+ let sub = simd_sub(a, b);
+ simd_shuffle!(add, sub, [4, 1, 6, 3])
}
/// Alternatively adds and subtracts packed single-precision (32-bit)
@@ -280,7 +284,11 @@ pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
- addsubps256(a, b)
+ let a = a.as_f32x8();
+ let b = b.as_f32x8();
+ let add = simd_add(a, b);
+ let sub = simd_sub(a, b);
+ simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
}
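The rewritten addsub intrinsics compute a lane-wise sum and difference and then interleave them with simd_shuffle, so even lanes take a - b and odd lanes take a + b. A minimal scalar sketch of that per-lane behaviour (the helper name is illustrative, not part of the patch):

fn addsub_ps_reference(a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    let mut r = [0.0f32; 8];
    for i in 0..8 {
        // Even lanes come from the subtraction vector, odd lanes from the addition
        // vector, mirroring the [8, 1, 10, 3, 12, 5, 14, 7] shuffle above.
        r[i] = if i % 2 == 0 { a[i] - b[i] } else { a[i] + b[i] };
    }
    r
}
// e.g. addsub_ps_reference([1.0; 8], [0.5; 8]) == [0.5, 1.5, 0.5, 1.5, 0.5, 1.5, 0.5, 1.5]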
/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
@@ -511,7 +519,8 @@ pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
- vblendvpd(a, b, c)
+ let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::splat(0));
+ transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
}
/// Blends packed single-precision (32-bit) floating-point elements from
@@ -523,7 +532,8 @@ pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
- vblendvps(a, b, c)
+ let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::splat(0));
+ transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
}
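Both blendv replacements build a lane mask from the sign bit of the control vector (simd_lt against zero) and feed it to simd_select, so a lane of b is chosen exactly when the control lane's most significant bit is set. A scalar sketch of that selection rule, assuming plain arrays in place of the __m256 types:

fn blendv_ps_reference(a: [f32; 8], b: [f32; 8], c: [f32; 8]) -> [f32; 8] {
    let mut r = [0.0f32; 8];
    for i in 0..8 {
        // (c[i].to_bits() as i32) < 0  <=>  the MSB (sign bit) of the control lane is set.
        r[i] = if (c[i].to_bits() as i32) < 0 { b[i] } else { a[i] };
    }
    r
}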
/// Conditionally multiplies the packed single-precision (32-bit) floating-point
@@ -2056,7 +2066,10 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
#[cfg_attr(test, assert_instr(vmovmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
- movmskpd256(a)
+ // Propagate the highest bit to the rest, because simd_bitmask
+ // requires all-1 or all-0.
+ let mask: i64x4 = simd_lt(transmute(a), i64x4::splat(0));
+ simd_bitmask::<i64x4, u8>(mask).into()
}
/// Sets each bit of the returned mask based on the most significant bit of the
@@ -2069,7 +2082,10 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
#[cfg_attr(test, assert_instr(vmovmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
- movmskps256(a)
+ // Propagate the highest bit to the rest, because simd_bitmask
+ // requires all-1 or all-0.
+ let mask: i32x8 = simd_lt(transmute(a), i32x8::splat(0));
+ simd_bitmask::<i32x8, u8>(mask).into()
}
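The movemask rewrites first widen each sign bit to a full lane with simd_lt (the comment notes that simd_bitmask needs all-ones or all-zeros lanes) and then pack the lanes into a small integer. A scalar sketch of the resulting value for the ps variant (helper name is illustrative only):

fn movemask_ps_reference(a: [f32; 8]) -> i32 {
    let mut mask = 0i32;
    for i in 0..8 {
        // Bit i of the result is the sign bit of lane i.
        if a[i].to_bits() >> 31 == 1 {
            mask |= 1 << i;
        }
    }
    mask
}
// e.g. movemask_ps_reference([-1.0, 2.0, -3.0, 4.0, 5.0, 6.0, 7.0, -8.0]) == 0b1000_0101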
/// Returns vector of type __m256d with all elements set to zero.
@@ -2904,20 +2920,12 @@ pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 {
// LLVM intrinsics used in the above functions
#[allow(improper_ctypes)]
extern "C" {
- #[link_name = "llvm.x86.avx.addsub.pd.256"]
- fn addsubpd256(a: __m256d, b: __m256d) -> __m256d;
- #[link_name = "llvm.x86.avx.addsub.ps.256"]
- fn addsubps256(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.avx.round.pd.256"]
fn roundpd256(a: __m256d, b: i32) -> __m256d;
#[link_name = "llvm.x86.avx.round.ps.256"]
fn roundps256(a: __m256, b: i32) -> __m256;
#[link_name = "llvm.x86.avx.sqrt.ps.256"]
fn sqrtps256(a: __m256) -> __m256;
- #[link_name = "llvm.x86.avx.blendv.pd.256"]
- fn vblendvpd(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
- #[link_name = "llvm.x86.avx.blendv.ps.256"]
- fn vblendvps(a: __m256, b: __m256, c: __m256) -> __m256;
#[link_name = "llvm.x86.avx.dp.ps.256"]
fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256;
#[link_name = "llvm.x86.avx.hadd.pd.256"]
@@ -3026,10 +3034,6 @@ extern "C" {
fn vtestcps(a: __m128, b: __m128) -> i32;
#[link_name = "llvm.x86.avx.vtestnzc.ps"]
fn vtestnzcps(a: __m128, b: __m128) -> i32;
- #[link_name = "llvm.x86.avx.movmsk.pd.256"]
- fn movmskpd256(a: __m256d) -> i32;
- #[link_name = "llvm.x86.avx.movmsk.ps.256"]
- fn movmskps256(a: __m256) -> i32;
#[link_name = "llvm.x86.avx.min.ps.256"]
fn vminps(a: __m256, b: __m256) -> __m256;
#[link_name = "llvm.x86.avx.max.ps.256"]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs
index e23c795ee..243a4cdab 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@@ -344,7 +344,10 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
- transmute(pavgw(a.as_u16x16(), b.as_u16x16()))
+ let a = simd_cast::<_, u32x16>(a.as_u16x16());
+ let b = simd_cast::<_, u32x16>(b.as_u16x16());
+ let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
+ transmute(simd_cast::<_, u16x16>(r))
}
/// Averages packed unsigned 8-bit integers in `a` and `b`.
@@ -355,7 +358,10 @@ pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
- transmute(pavgb(a.as_u8x32(), b.as_u8x32()))
+ let a = simd_cast::<_, u16x32>(a.as_u8x32());
+ let b = simd_cast::<_, u16x32>(b.as_u8x32());
+ let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
+ transmute(simd_cast::<_, u8x32>(r))
}
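The pavg replacements compute the rounding average (a + b + 1) >> 1 in a lane type twice as wide, so the intermediate sum cannot wrap before the shift. Per element this amounts to (illustrative name, not the patch's code):

fn avg_epu8_reference(a: u8, b: u8) -> u8 {
    // Widen to u16, add the rounding bias, halve, narrow back.
    ((a as u16 + b as u16 + 1) >> 1) as u8
}
// e.g. avg_epu8_reference(255, 254) == 255 -- the u16 widening prevents overflow.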
/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
@@ -458,7 +464,8 @@ pub unsafe fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
#[cfg_attr(test, assert_instr(vpblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
- transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
+ let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::splat(0));
+ transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
}
/// Broadcasts the low packed 8-bit integer from `a` to all elements of
@@ -2060,7 +2067,9 @@ pub unsafe fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __
#[cfg_attr(test, assert_instr(vpmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
- transmute(pmuldq(a.as_i32x8(), b.as_i32x8()))
+ let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
+ let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
+ transmute(simd_mul(a, b))
}
/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
@@ -2074,7 +2083,10 @@ pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
- transmute(pmuludq(a.as_u32x8(), b.as_u32x8()))
+ let a = a.as_u64x4();
+ let b = b.as_u64x4();
+ let mask = u64x4::splat(u32::MAX.into());
+ transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
}
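_mm256_mul_epi32 and _mm256_mul_epu32 multiply only the low 32 bits of every 64-bit lane and keep the full 64-bit product: the signed form sign-extends via the i64x4 -> i32x4 -> i64x4 cast round-trip, the unsigned form masks the upper halves before multiplying. A per-lane scalar sketch (names are illustrative only):

fn mul_epu32_lane(a: u64, b: u64) -> u64 {
    // Low 32 bits of each operand, full 64-bit unsigned product.
    (a & 0xFFFF_FFFF) * (b & 0xFFFF_FFFF)
}

fn mul_epi32_lane(a: i64, b: i64) -> i64 {
    // Truncate to the low 32 bits, reinterpret as signed, sign-extend, multiply.
    (a as i32 as i64) * (b as i32 as i64)
}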
/// Multiplies the packed 16-bit integers in `a` and `b`, producing
@@ -2087,7 +2099,10 @@ pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
- transmute(pmulhw(a.as_i16x16(), b.as_i16x16()))
+ let a = simd_cast::<_, i32x16>(a.as_i16x16());
+ let b = simd_cast::<_, i32x16>(b.as_i16x16());
+ let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
+ transmute(simd_cast::<i32x16, i16x16>(r))
}
/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
@@ -2100,7 +2115,10 @@ pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
- transmute(pmulhuw(a.as_u16x16(), b.as_u16x16()))
+ let a = simd_cast::<_, u32x16>(a.as_u16x16());
+ let b = simd_cast::<_, u32x16>(b.as_u16x16());
+ let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
+ transmute(simd_cast::<u32x16, u16x16>(r))
}
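Both mulhi rewrites widen the 16-bit lanes to 32 bits, multiply, shift right by 16 and narrow again, which is exactly the high half of the full product. Per lane (illustrative names):

fn mulhi_epu16_lane(a: u16, b: u16) -> u16 {
    ((a as u32 * b as u32) >> 16) as u16
}

fn mulhi_epi16_lane(a: i16, b: i16) -> i16 {
    // Arithmetic shift on the signed 32-bit product keeps the sign of the high half.
    ((a as i32 * b as i32) >> 16) as i16
}
// e.g. mulhi_epu16_lane(0x8000, 0x0004) == 0x0002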
/// Multiplies the packed 16-bit integers in `a` and `b`, producing
@@ -3629,12 +3647,6 @@ extern "C" {
fn pabsw(a: i16x16) -> u16x16;
#[link_name = "llvm.x86.avx2.pabs.d"]
fn pabsd(a: i32x8) -> u32x8;
- #[link_name = "llvm.x86.avx2.pavg.b"]
- fn pavgb(a: u8x32, b: u8x32) -> u8x32;
- #[link_name = "llvm.x86.avx2.pavg.w"]
- fn pavgw(a: u16x16, b: u16x16) -> u16x16;
- #[link_name = "llvm.x86.avx2.pblendvb"]
- fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32;
#[link_name = "llvm.x86.avx2.phadd.w"]
fn phaddw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.phadd.d"]
@@ -3669,14 +3681,6 @@ extern "C" {
fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
#[link_name = "llvm.x86.avx2.mpsadbw"]
fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
- #[link_name = "llvm.x86.avx2.pmulhu.w"]
- fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
- #[link_name = "llvm.x86.avx2.pmulh.w"]
- fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
- #[link_name = "llvm.x86.avx2.pmul.dq"]
- fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
- #[link_name = "llvm.x86.avx2.pmulu.dq"]
- fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
#[link_name = "llvm.x86.avx2.pmul.hr.sw"]
fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.packsswb"]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
index 92e572eb1..ce4e402a8 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
@@ -311,7 +311,7 @@ pub unsafe fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __
#[target_feature(enable = "avx512bitalg")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 {
- transmute(bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0))
+ bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -326,7 +326,7 @@ pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64
#[target_feature(enable = "avx512bitalg")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m512i) -> __mmask64 {
- transmute(bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k))
+ bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -338,7 +338,7 @@ pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m
#[target_feature(enable = "avx512bitalg,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 {
- transmute(bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0))
+ bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -353,7 +353,7 @@ pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32
#[target_feature(enable = "avx512bitalg,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m256i) -> __mmask32 {
- transmute(bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k))
+ bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -365,7 +365,7 @@ pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m
#[target_feature(enable = "avx512bitalg,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 {
- transmute(bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0))
+ bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0)
}
/// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers.
@@ -380,7 +380,7 @@ pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 {
#[target_feature(enable = "avx512bitalg,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufbitqmb))]
pub unsafe fn _mm_mask_bitshuffle_epi64_mask(k: __mmask16, b: __m128i, c: __m128i) -> __mmask16 {
- transmute(bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k))
+ bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k)
}
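The bitshuffle intrinsics now return the mask produced by the LLVM helper directly, since it already has the __mmask return type. As a reminder of what vpshufbitqmb computes, a scalar sketch of the 128-bit variant under the usual reading of the instruction (illustrative only, not the patch's code): bit i of the mask is the bit of the corresponding 64-bit lane of b selected by the low six bits of byte i of c.

fn bitshuffle_epi64_mask_reference(b: [u64; 2], c: [u8; 16]) -> u16 {
    let mut k = 0u16;
    for i in 0..16 {
        let qword = b[i / 8]; // bytes 0..8 index b[0], bytes 8..16 index b[1]
        let bit = (qword >> (c[i] & 63)) & 1;
        k |= (bit as u16) << i;
    }
    k
}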
#[cfg(test)]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
index 364023539..0b4a56d36 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
@@ -3703,8 +3703,7 @@ pub unsafe fn _mm512_cmp_epu16_mask<const IMM8: i32>(a: __m512i, b: __m512i) ->
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x32();
let b = b.as_u16x32();
- let r = vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
- transmute(r)
+ vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3722,8 +3721,7 @@ pub unsafe fn _mm512_mask_cmp_epu16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x32();
let b = b.as_u16x32();
- let r = vpcmpuw(a, b, IMM8, k1);
- transmute(r)
+ vpcmpuw(a, b, IMM8, k1)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3737,8 +3735,7 @@ pub unsafe fn _mm256_cmp_epu16_mask<const IMM8: i32>(a: __m256i, b: __m256i) ->
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x16();
let b = b.as_u16x16();
- let r = vpcmpuw256(a, b, IMM8, 0b11111111_11111111);
- transmute(r)
+ vpcmpuw256(a, b, IMM8, 0b11111111_11111111)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3756,8 +3753,7 @@ pub unsafe fn _mm256_mask_cmp_epu16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x16();
let b = b.as_u16x16();
- let r = vpcmpuw256(a, b, IMM8, k1);
- transmute(r)
+ vpcmpuw256(a, b, IMM8, k1)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3771,8 +3767,7 @@ pub unsafe fn _mm_cmp_epu16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __m
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x8();
let b = b.as_u16x8();
- let r = vpcmpuw128(a, b, IMM8, 0b11111111);
- transmute(r)
+ vpcmpuw128(a, b, IMM8, 0b11111111)
}
/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3790,8 +3785,7 @@ pub unsafe fn _mm_mask_cmp_epu16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u16x8();
let b = b.as_u16x8();
- let r = vpcmpuw128(a, b, IMM8, k1);
- transmute(r)
+ vpcmpuw128(a, b, IMM8, k1)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3805,13 +3799,12 @@ pub unsafe fn _mm512_cmp_epu8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> _
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x64();
let b = b.as_u8x64();
- let r = vpcmpub(
+ vpcmpub(
a,
b,
IMM8,
0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
- );
- transmute(r)
+ )
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3829,8 +3822,7 @@ pub unsafe fn _mm512_mask_cmp_epu8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x64();
let b = b.as_u8x64();
- let r = vpcmpub(a, b, IMM8, k1);
- transmute(r)
+ vpcmpub(a, b, IMM8, k1)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3844,8 +3836,7 @@ pub unsafe fn _mm256_cmp_epu8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> _
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x32();
let b = b.as_u8x32();
- let r = vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
- transmute(r)
+ vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3863,8 +3854,7 @@ pub unsafe fn _mm256_mask_cmp_epu8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x32();
let b = b.as_u8x32();
- let r = vpcmpub256(a, b, IMM8, k1);
- transmute(r)
+ vpcmpub256(a, b, IMM8, k1)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3878,8 +3868,7 @@ pub unsafe fn _mm_cmp_epu8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mm
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x16();
let b = b.as_u8x16();
- let r = vpcmpub128(a, b, IMM8, 0b11111111_11111111);
- transmute(r)
+ vpcmpub128(a, b, IMM8, 0b11111111_11111111)
}
/// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3897,8 +3886,7 @@ pub unsafe fn _mm_mask_cmp_epu8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_u8x16();
let b = b.as_u8x16();
- let r = vpcmpub128(a, b, IMM8, k1);
- transmute(r)
+ vpcmpub128(a, b, IMM8, k1)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3912,8 +3900,7 @@ pub unsafe fn _mm512_cmp_epi16_mask<const IMM8: i32>(a: __m512i, b: __m512i) ->
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x32();
let b = b.as_i16x32();
- let r = vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
- transmute(r)
+ vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3931,8 +3918,7 @@ pub unsafe fn _mm512_mask_cmp_epi16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x32();
let b = b.as_i16x32();
- let r = vpcmpw(a, b, IMM8, k1);
- transmute(r)
+ vpcmpw(a, b, IMM8, k1)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3946,8 +3932,7 @@ pub unsafe fn _mm256_cmp_epi16_mask<const IMM8: i32>(a: __m256i, b: __m256i) ->
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x16();
let b = b.as_i16x16();
- let r = vpcmpw256(a, b, IMM8, 0b11111111_11111111);
- transmute(r)
+ vpcmpw256(a, b, IMM8, 0b11111111_11111111)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3965,8 +3950,7 @@ pub unsafe fn _mm256_mask_cmp_epi16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x16();
let b = b.as_i16x16();
- let r = vpcmpw256(a, b, IMM8, k1);
- transmute(r)
+ vpcmpw256(a, b, IMM8, k1)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -3980,8 +3964,7 @@ pub unsafe fn _mm_cmp_epi16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __m
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x8();
let b = b.as_i16x8();
- let r = vpcmpw128(a, b, IMM8, 0b11111111);
- transmute(r)
+ vpcmpw128(a, b, IMM8, 0b11111111)
}
/// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3999,8 +3982,7 @@ pub unsafe fn _mm_mask_cmp_epi16_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i16x8();
let b = b.as_i16x8();
- let r = vpcmpw128(a, b, IMM8, k1);
- transmute(r)
+ vpcmpw128(a, b, IMM8, k1)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4014,13 +3996,12 @@ pub unsafe fn _mm512_cmp_epi8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> _
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x64();
let b = b.as_i8x64();
- let r = vpcmpb(
+ vpcmpb(
a,
b,
IMM8,
0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
- );
- transmute(r)
+ )
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4038,8 +4019,7 @@ pub unsafe fn _mm512_mask_cmp_epi8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x64();
let b = b.as_i8x64();
- let r = vpcmpb(a, b, IMM8, k1);
- transmute(r)
+ vpcmpb(a, b, IMM8, k1)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4053,8 +4033,7 @@ pub unsafe fn _mm256_cmp_epi8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> _
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x32();
let b = b.as_i8x32();
- let r = vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111);
- transmute(r)
+ vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4072,8 +4051,7 @@ pub unsafe fn _mm256_mask_cmp_epi8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x32();
let b = b.as_i8x32();
- let r = vpcmpb256(a, b, IMM8, k1);
- transmute(r)
+ vpcmpb256(a, b, IMM8, k1)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4087,8 +4065,7 @@ pub unsafe fn _mm_cmp_epi8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mm
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x16();
let b = b.as_i8x16();
- let r = vpcmpb128(a, b, IMM8, 0b11111111_11111111);
- transmute(r)
+ vpcmpb128(a, b, IMM8, 0b11111111_11111111)
}
/// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4106,8 +4083,7 @@ pub unsafe fn _mm_mask_cmp_epi8_mask<const IMM8: i32>(
static_assert_uimm_bits!(IMM8, 3);
let a = a.as_i8x16();
let b = b.as_i8x16();
- let r = vpcmpb128(a, b, IMM8, k1);
- transmute(r)
+ vpcmpb128(a, b, IMM8, k1)
}
/// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
@@ -8566,7 +8542,7 @@ pub unsafe fn _mm_movm_epi8(k: __mmask16) -> __m128i {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(a + b)
+ a + b
}
/// Add 64-bit masks in a and b, and store the result in k.
@@ -8575,7 +8551,7 @@ pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(a + b)
+ a + b
}
/// Compute the bitwise AND of 32-bit masks a and b, and store the result in k.
@@ -8584,7 +8560,7 @@ pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(a & b)
+ a & b
}
/// Compute the bitwise AND of 64-bit masks a and b, and store the result in k.
@@ -8593,7 +8569,7 @@ pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(a & b)
+ a & b
}
/// Compute the bitwise NOT of 32-bit mask a, and store the result in k.
@@ -8602,7 +8578,7 @@ pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 {
- transmute(a ^ 0b11111111_11111111_11111111_11111111)
+ a ^ 0b11111111_11111111_11111111_11111111
}
/// Compute the bitwise NOT of 64-bit mask a, and store the result in k.
@@ -8611,7 +8587,7 @@ pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 {
- transmute(a ^ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111)
+ a ^ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
}
/// Compute the bitwise NOT of 32-bit masks a and then AND with b, and store the result in k.
@@ -8620,7 +8596,7 @@ pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(_knot_mask32(a) & b)
+ _knot_mask32(a) & b
}
/// Compute the bitwise NOT of 64-bit masks a and then AND with b, and store the result in k.
@@ -8629,7 +8605,7 @@ pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(_knot_mask64(a) & b)
+ _knot_mask64(a) & b
}
/// Compute the bitwise OR of 32-bit masks a and b, and store the result in k.
@@ -8638,7 +8614,7 @@ pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(a | b)
+ a | b
}
/// Compute the bitwise OR of 64-bit masks a and b, and store the result in k.
@@ -8647,7 +8623,7 @@ pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(a | b)
+ a | b
}
/// Compute the bitwise XOR of 32-bit masks a and b, and store the result in k.
@@ -8656,7 +8632,7 @@ pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(a ^ b)
+ a ^ b
}
/// Compute the bitwise XOR of 64-bit masks a and b, and store the result in k.
@@ -8665,7 +8641,7 @@ pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(a ^ b)
+ a ^ b
}
/// Compute the bitwise XNOR of 32-bit masks a and b, and store the result in k.
@@ -8674,7 +8650,7 @@ pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
- transmute(_knot_mask32(a ^ b))
+ _knot_mask32(a ^ b)
}
/// Compute the bitwise XNOR of 64-bit masks a and b, and store the result in k.
@@ -8683,7 +8659,7 @@ pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
#[inline]
#[target_feature(enable = "avx512bw")]
pub unsafe fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
- transmute(_knot_mask64(a ^ b))
+ _knot_mask64(a ^ b)
}
/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
index 5412237ca..280135292 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
@@ -17144,7 +17144,7 @@ pub unsafe fn _mm512_slli_epi32<const IMM8: u32>(a: __m512i) -> __m512i {
if IMM8 >= 32 {
_mm512_setzero_si512()
} else {
- transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8 as u32)))
+ transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8)))
}
}
@@ -20132,7 +20132,7 @@ pub unsafe fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m5
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] //should be vpermd
pub unsafe fn _mm256_permutexvar_epi32(idx: __m256i, a: __m256i) -> __m256i {
- transmute(_mm256_permutevar8x32_epi32(a, idx)) // llvm use llvm.x86.avx2.permd
+ _mm256_permutevar8x32_epi32(a, idx) // llvm use llvm.x86.avx2.permd
}
/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -20284,7 +20284,7 @@ pub unsafe fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512)
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermps))]
pub unsafe fn _mm256_permutexvar_ps(idx: __m256i, a: __m256) -> __m256 {
- transmute(_mm256_permutevar8x32_ps(a, idx)) //llvm.x86.avx2.permps
+ _mm256_permutevar8x32_ps(a, idx) //llvm.x86.avx2.permps
}
/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -23943,7 +23943,7 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d {
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(vmovd))]
pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 {
let extract: i32 = simd_extract(a.as_i32x16(), 0);
- transmute(extract)
+ extract
}
/// Broadcast the low packed 32-bit integer from a to all elements of dst.
@@ -25744,7 +25744,7 @@ pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a & b)
+ a & b
}
/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k.
@@ -25754,7 +25754,7 @@ pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a & b)
+ a & b
}
/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
@@ -25764,7 +25764,7 @@ pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a | b)
+ a | b
}
/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
@@ -25774,7 +25774,7 @@ pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a | b)
+ a | b
}
/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
@@ -25784,7 +25784,7 @@ pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a ^ b)
+ a ^ b
}
/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
@@ -25794,7 +25794,7 @@ pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 {
- transmute(a ^ b)
+ a ^ b
}
/// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
@@ -25803,7 +25803,7 @@ pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 {
- transmute(a ^ 0b11111111_11111111)
+ a ^ 0b11111111_11111111
}
/// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
@@ -25812,7 +25812,7 @@ pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 {
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_knot(a: __mmask16) -> __mmask16 {
- transmute(a ^ 0b11111111_11111111)
+ a ^ 0b11111111_11111111
}
/// Compute the bitwise NOT of 16-bit masks a and then AND with b, and store the result in k.
@@ -25862,8 +25862,7 @@ pub unsafe fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw
pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 {
- let r: u16 = a;
- transmute(r)
+ a
}
/// Converts integer mask into bitmask, storing the result in dst.
@@ -25872,8 +25871,7 @@ pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 {
#[inline]
#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw
pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 {
- let r: u16 = mask as u16;
- transmute(r)
+ mask as u16
}
/// Converts bit mask k1 into an integer value, storing the results in dst.
@@ -25883,8 +25881,7 @@ pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw
pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 {
- let r: i32 = k1 as i32;
- transmute(r)
+ k1 as i32
}
/// Unpack and interleave 8 bits from masks a and b, and store the 16-bit result in k.
@@ -25896,7 +25893,7 @@ pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 {
pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 {
let a = a & 0b00000000_11111111;
let b = b & 0b11111111_00000000;
- transmute(a | b)
+ a | b
}
/// Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's.
@@ -32352,8 +32349,7 @@ pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -
if (k & 0b00000001) != 0 {
mov = simd_extract(b, 0);
}
- let r = simd_insert(a, 0, mov);
- transmute(r)
+ simd_insert(a, 0, mov)
}
/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32367,8 +32363,7 @@ pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
if (k & 0b00000001) != 0 {
mov = simd_extract(b, 0);
}
- let r = simd_insert(a, 0, mov);
- transmute(r)
+ simd_insert(a, 0, mov)
}
/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32383,8 +32378,7 @@ pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d
if (k & 0b00000001) != 0 {
mov = simd_extract(b, 0);
}
- let r = simd_insert(a, 0, mov);
- transmute(r)
+ simd_insert(a, 0, mov)
}
/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32398,8 +32392,7 @@ pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d
if (k & 0b00000001) != 0 {
mov = simd_extract(b, 0);
}
- let r = simd_insert(a, 0, mov);
- transmute(r)
+ simd_insert(a, 0, mov)
}
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32416,8 +32409,7 @@ pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
let extractb: f32 = simd_extract(b, 0);
add = extracta + extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32433,8 +32425,7 @@ pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
let extractb: f32 = simd_extract(b, 0);
add = extracta + extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32451,8 +32442,7 @@ pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
let extractb: f64 = simd_extract(b, 0);
add = extracta + extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32468,8 +32458,7 @@ pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
let extractb: f64 = simd_extract(b, 0);
add = extracta + extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
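Each of these scalar mask intrinsics now returns the simd_insert expression directly. The pattern is the same throughout: perform the scalar operation on lane 0 only when bit 0 of the mask is set, otherwise fall back to src (writemask form) or 0.0 (zeromask form), and copy the remaining lanes from a. A sketch with plain arrays standing in for __m128 (names are illustrative only):

fn mask_add_ss_reference(src: [f32; 4], k: u8, a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    // Writemask: lane 0 falls back to src when mask bit 0 is clear.
    let lane0 = if k & 1 != 0 { a[0] + b[0] } else { src[0] };
    [lane0, a[1], a[2], a[3]]
}

fn maskz_add_ss_reference(k: u8, a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    // Zeromask: lane 0 falls back to 0.0 when mask bit 0 is clear.
    let lane0 = if k & 1 != 0 { a[0] + b[0] } else { 0.0 };
    [lane0, a[1], a[2], a[3]]
}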
/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32486,8 +32475,7 @@ pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
let extractb: f32 = simd_extract(b, 0);
add = extracta - extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32503,8 +32491,7 @@ pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
let extractb: f32 = simd_extract(b, 0);
add = extracta - extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32521,8 +32508,7 @@ pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
let extractb: f64 = simd_extract(b, 0);
add = extracta - extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32538,8 +32524,7 @@ pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
let extractb: f64 = simd_extract(b, 0);
add = extracta - extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32556,8 +32541,7 @@ pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
let extractb: f32 = simd_extract(b, 0);
add = extracta * extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32573,8 +32557,7 @@ pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
let extractb: f32 = simd_extract(b, 0);
add = extracta * extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32591,8 +32574,7 @@ pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
let extractb: f64 = simd_extract(b, 0);
add = extracta * extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32608,8 +32590,7 @@ pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
let extractb: f64 = simd_extract(b, 0);
add = extracta * extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32626,8 +32607,7 @@ pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
let extractb: f32 = simd_extract(b, 0);
add = extracta / extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -32643,8 +32623,7 @@ pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
let extractb: f32 = simd_extract(b, 0);
add = extracta / extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32661,8 +32640,7 @@ pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
let extractb: f64 = simd_extract(b, 0);
add = extracta / extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -32678,8 +32656,7 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
let extractb: f64 = simd_extract(b, 0);
add = extracta / extractb;
}
- let r = simd_insert(a, 0, add);
- transmute(r)
+ simd_insert(a, 0, add)
}
/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33587,8 +33564,7 @@ pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) ->
let extractc: f32 = simd_extract(c, 0);
fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33605,8 +33581,7 @@ pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -
let extractc: f32 = simd_extract(c, 0);
fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
@@ -33622,8 +33597,7 @@ pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -
let extractb: f32 = simd_extract(b, 0);
fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fmadd);
- transmute(r)
+ simd_insert(c, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33639,8 +33613,7 @@ pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d)
let extractc: f64 = simd_extract(c, 0);
fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33657,8 +33630,7 @@ pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d
let extractc: f64 = simd_extract(c, 0);
fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
@@ -33674,8 +33646,7 @@ pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8
let extractb: f64 = simd_extract(b, 0);
fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fmadd);
- transmute(r)
+ simd_insert(c, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33692,8 +33663,7 @@ pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) ->
let extractc = -extractc;
fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33711,8 +33681,7 @@ pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -
let extractc = -extractc;
fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
@@ -33729,8 +33698,7 @@ pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -
let extractc = -fmsub;
fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fmsub);
- transmute(r)
+ simd_insert(c, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33747,8 +33715,7 @@ pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d)
let extractc = -extractc;
fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33766,8 +33733,7 @@ pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d
let extractc = -extractc;
fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
@@ -33784,8 +33750,7 @@ pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8
let extractc = -fmsub;
fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fmsub);
- transmute(r)
+ simd_insert(c, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33802,8 +33767,7 @@ pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -
let extractc: f32 = simd_extract(c, 0);
fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33821,8 +33785,7 @@ pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128)
let extractc: f32 = simd_extract(c, 0);
fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
@@ -33839,8 +33802,7 @@ pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8)
let extractb: f32 = simd_extract(b, 0);
fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fnmadd);
- transmute(r)
+ simd_insert(c, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33857,8 +33819,7 @@ pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d
let extractc: f64 = simd_extract(c, 0);
fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33876,8 +33837,7 @@ pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128
let extractc: f64 = simd_extract(c, 0);
fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
@@ -33894,8 +33854,7 @@ pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask
let extractb: f64 = simd_extract(b, 0);
fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fnmadd);
- transmute(r)
+ simd_insert(c, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33913,8 +33872,7 @@ pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -
let extractc = -extractc;
fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -33933,8 +33891,7 @@ pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128)
let extractc = -extractc;
fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
@@ -33952,8 +33909,7 @@ pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8)
let extractc = -fnmsub;
fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fnmsub);
- transmute(r)
+ simd_insert(c, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33971,8 +33927,7 @@ pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d
let extractc = -extractc;
fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
@@ -33991,8 +33946,7 @@ pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128
let extractc = -extractc;
fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
@@ -34010,8 +33964,7 @@ pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask
let extractc = -fnmsub;
fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
}
- let r = simd_insert(c, 0, fnmsub);
- transmute(r)
+ simd_insert(c, 0, fnmsub)
}
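A note added for this write-up: the four scalar FMA flavours that this stretch of the diff keeps touching differ only in which operands are negated, which is why the implementations above flip the sign of extracta and/or extractc before calling the vfmadd132ss/vfmadd132sd helpers. A plain-Rust model (illustrative names, not code from the crate), using f64::mul_add for the fused operation:

    fn fmadd(a: f64, b: f64, c: f64) -> f64 { a.mul_add(b, c) }      //  (a * b) + c
    fn fmsub(a: f64, b: f64, c: f64) -> f64 { a.mul_add(b, -c) }     //  (a * b) - c
    fn fnmadd(a: f64, b: f64, c: f64) -> f64 { (-a).mul_add(b, c) }  // -(a * b) + c
    fn fnmsub(a: f64, b: f64, c: f64) -> f64 { (-a).mul_add(b, -c) } // -(a * b) - c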
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35705,8 +35658,7 @@ pub unsafe fn _mm_fmadd_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: _
let extractb: f32 = simd_extract(b, 0);
let extractc: f32 = simd_extract(c, 0);
let r = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, r);
- transmute(r)
+ simd_insert(a, 0, r)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35736,8 +35688,7 @@ pub unsafe fn _mm_mask_fmadd_round_ss<const ROUNDING: i32>(
let extractc: f32 = simd_extract(c, 0);
fmadd = vfmadd132ss(fmadd, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35768,8 +35719,7 @@ pub unsafe fn _mm_maskz_fmadd_round_ss<const ROUNDING: i32>(
let extractc: f32 = simd_extract(c, 0);
fmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
@@ -35799,8 +35749,7 @@ pub unsafe fn _mm_mask3_fmadd_round_ss<const ROUNDING: i32>(
let extractb: f32 = simd_extract(b, 0);
fmadd = vfmadd132ss(extracta, extractb, fmadd, ROUNDING);
}
- let r = simd_insert(c, 0, fmadd);
- transmute(r)
+ simd_insert(c, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
@@ -35827,8 +35776,7 @@ pub unsafe fn _mm_fmadd_round_sd<const ROUNDING: i32>(
let extractb: f64 = simd_extract(b, 0);
let extractc: f64 = simd_extract(c, 0);
let fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -35858,8 +35806,7 @@ pub unsafe fn _mm_mask_fmadd_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
fmadd = vfmadd132sd(fmadd, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -35890,8 +35837,7 @@ pub unsafe fn _mm_maskz_fmadd_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmadd);
- transmute(r)
+ simd_insert(a, 0, fmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
@@ -35921,8 +35867,7 @@ pub unsafe fn _mm_mask3_fmadd_round_sd<const ROUNDING: i32>(
let extractb: f64 = simd_extract(b, 0);
fmadd = vfmadd132sd(extracta, extractb, fmadd, ROUNDING);
}
- let r = simd_insert(c, 0, fmadd);
- transmute(r)
+ simd_insert(c, 0, fmadd)
}
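For orientation (an addition here, not part of the diff): the mask, maskz and mask3 variants documented above differ only in what lands in lane 0 when mask bit 0 is clear (a copy of a, zero, or a copy of c respectively) and in which operand supplies the upper lane. A plain-Rust sketch of those selection rules for the double-precision case, with illustrative names:

    fn mask_fmadd_sd(a: [f64; 2], k: u8, b: [f64; 2], c: [f64; 2]) -> [f64; 2] {
        let lane0 = if k & 1 != 0 { a[0].mul_add(b[0], c[0]) } else { a[0] };
        [lane0, a[1]] // upper lane always comes from `a`
    }

    fn maskz_fmadd_sd(k: u8, a: [f64; 2], b: [f64; 2], c: [f64; 2]) -> [f64; 2] {
        let lane0 = if k & 1 != 0 { a[0].mul_add(b[0], c[0]) } else { 0.0 };
        [lane0, a[1]]
    }

    fn mask3_fmadd_sd(a: [f64; 2], b: [f64; 2], c: [f64; 2], k: u8) -> [f64; 2] {
        let lane0 = if k & 1 != 0 { a[0].mul_add(b[0], c[0]) } else { c[0] };
        [lane0, c[1]] // upper lane always comes from `c`
    }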
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35946,8 +35891,7 @@ pub unsafe fn _mm_fmsub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: _
let extractc: f32 = simd_extract(c, 0);
let extractc = -extractc;
let fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -35978,8 +35922,7 @@ pub unsafe fn _mm_mask_fmsub_round_ss<const ROUNDING: i32>(
let extractc = -extractc;
fmsub = vfmadd132ss(fmsub, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36011,8 +35954,7 @@ pub unsafe fn _mm_maskz_fmsub_round_ss<const ROUNDING: i32>(
let extractc = -extractc;
fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
@@ -36043,8 +35985,7 @@ pub unsafe fn _mm_mask3_fmsub_round_ss<const ROUNDING: i32>(
let extractc = -fmsub;
fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(c, 0, fmsub);
- transmute(r)
+ simd_insert(c, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
@@ -36072,8 +36013,7 @@ pub unsafe fn _mm_fmsub_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
let extractc = -extractc;
let fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36104,8 +36044,7 @@ pub unsafe fn _mm_mask_fmsub_round_sd<const ROUNDING: i32>(
let extractc = -extractc;
fmsub = vfmadd132sd(fmsub, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36137,8 +36076,7 @@ pub unsafe fn _mm_maskz_fmsub_round_sd<const ROUNDING: i32>(
let extractc = -extractc;
fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fmsub);
- transmute(r)
+ simd_insert(a, 0, fmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
@@ -36169,8 +36107,7 @@ pub unsafe fn _mm_mask3_fmsub_round_sd<const ROUNDING: i32>(
let extractc = -fmsub;
fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(c, 0, fmsub);
- transmute(r)
+ simd_insert(c, 0, fmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36194,8 +36131,7 @@ pub unsafe fn _mm_fnmadd_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c:
let extractb: f32 = simd_extract(b, 0);
let extractc: f32 = simd_extract(c, 0);
let fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36226,8 +36162,7 @@ pub unsafe fn _mm_mask_fnmadd_round_ss<const ROUNDING: i32>(
let extractc: f32 = simd_extract(c, 0);
fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36259,8 +36194,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_ss<const ROUNDING: i32>(
let extractc: f32 = simd_extract(c, 0);
fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
@@ -36291,8 +36225,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_ss<const ROUNDING: i32>(
let extractb: f32 = simd_extract(b, 0);
fnmadd = vfmadd132ss(extracta, extractb, fnmadd, ROUNDING);
}
- let r = simd_insert(c, 0, fnmadd);
- transmute(r)
+ simd_insert(c, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
@@ -36320,8 +36253,7 @@ pub unsafe fn _mm_fnmadd_round_sd<const ROUNDING: i32>(
let extractb: f64 = simd_extract(b, 0);
let extractc: f64 = simd_extract(c, 0);
let fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36352,8 +36284,7 @@ pub unsafe fn _mm_mask_fnmadd_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36385,8 +36316,7 @@ pub unsafe fn _mm_maskz_fnmadd_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmadd);
- transmute(r)
+ simd_insert(a, 0, fnmadd)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
@@ -36417,8 +36347,7 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd<const ROUNDING: i32>(
let extractb: f64 = simd_extract(b, 0);
fnmadd = vfmadd132sd(extracta, extractb, fnmadd, ROUNDING);
}
- let r = simd_insert(c, 0, fnmadd);
- transmute(r)
+ simd_insert(c, 0, fnmadd)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36443,8 +36372,7 @@ pub unsafe fn _mm_fnmsub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c:
let extractc: f32 = simd_extract(c, 0);
let extractc = -extractc;
let fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36476,8 +36404,7 @@ pub unsafe fn _mm_mask_fnmsub_round_ss<const ROUNDING: i32>(
let extractc = -extractc;
fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -36510,8 +36437,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss<const ROUNDING: i32>(
let extractc = -extractc;
fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
@@ -36543,8 +36469,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_ss<const ROUNDING: i32>(
let extractc = -fnmsub;
fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(c, 0, fnmsub);
- transmute(r)
+ simd_insert(c, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
@@ -36573,8 +36498,7 @@ pub unsafe fn _mm_fnmsub_round_sd<const ROUNDING: i32>(
let extractc: f64 = simd_extract(c, 0);
let extractc = -extractc;
let fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36606,8 +36530,7 @@ pub unsafe fn _mm_mask_fnmsub_round_sd<const ROUNDING: i32>(
let extractc = -extractc;
fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
@@ -36640,8 +36563,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd<const ROUNDING: i32>(
let extractc = -extractc;
fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(a, 0, fnmsub);
- transmute(r)
+ simd_insert(a, 0, fnmsub)
}
/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
@@ -36673,8 +36595,7 @@ pub unsafe fn _mm_mask3_fnmsub_round_sd<const ROUNDING: i32>(
let extractc = -fnmsub;
fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING);
}
- let r = simd_insert(c, 0, fnmsub);
- transmute(r)
+ simd_insert(c, 0, fnmsub)
}
/// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
@@ -37168,8 +37089,7 @@ pub unsafe fn _mm_maskz_cvt_roundsd_ss<const ROUNDING: i32>(
pub unsafe fn _mm_cvt_roundss_si32<const ROUNDING: i32>(a: __m128) -> i32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f32x4();
- let r = vcvtss2si(a, ROUNDING);
- transmute(r)
+ vcvtss2si(a, ROUNDING)
}
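A hedged usage sketch for the explicit-rounding conversions in this block (added for illustration; it assumes a nightly toolchain with the unstable AVX-512 intrinsics enabled and an avx512f-capable CPU, and the function name is made up). The ROUNDING constant must be one of the combinations static_assert_rounding! accepts:

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    unsafe fn round_both_ways(x: f32) -> (i32, i32) {
        let down = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(_mm_set_ss(x));
        let up = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(_mm_set_ss(x));
        (down, up) // e.g. x = 2.5 gives (2, 3)
    }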
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
@@ -37188,8 +37108,7 @@ pub unsafe fn _mm_cvt_roundss_si32<const ROUNDING: i32>(a: __m128) -> i32 {
pub unsafe fn _mm_cvt_roundss_i32<const ROUNDING: i32>(a: __m128) -> i32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f32x4();
- let r = vcvtss2si(a, ROUNDING);
- transmute(r)
+ vcvtss2si(a, ROUNDING)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\
@@ -37208,8 +37127,7 @@ pub unsafe fn _mm_cvt_roundss_i32<const ROUNDING: i32>(a: __m128) -> i32 {
pub unsafe fn _mm_cvt_roundss_u32<const ROUNDING: i32>(a: __m128) -> u32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f32x4();
- let r = vcvtss2usi(a, ROUNDING);
- transmute(r)
+ vcvtss2usi(a, ROUNDING)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
@@ -37219,7 +37137,7 @@ pub unsafe fn _mm_cvt_roundss_u32<const ROUNDING: i32>(a: __m128) -> u32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2si))]
pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 {
- transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
@@ -37229,7 +37147,7 @@ pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2usi))]
pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 {
- transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
@@ -37248,8 +37166,7 @@ pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 {
pub unsafe fn _mm_cvt_roundsd_si32<const ROUNDING: i32>(a: __m128d) -> i32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f64x2();
- let r = vcvtsd2si(a, ROUNDING);
- transmute(r)
+ vcvtsd2si(a, ROUNDING)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
@@ -37268,8 +37185,7 @@ pub unsafe fn _mm_cvt_roundsd_si32<const ROUNDING: i32>(a: __m128d) -> i32 {
pub unsafe fn _mm_cvt_roundsd_i32<const ROUNDING: i32>(a: __m128d) -> i32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f64x2();
- let r = vcvtsd2si(a, ROUNDING);
- transmute(r)
+ vcvtsd2si(a, ROUNDING)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\
@@ -37288,8 +37204,7 @@ pub unsafe fn _mm_cvt_roundsd_i32<const ROUNDING: i32>(a: __m128d) -> i32 {
pub unsafe fn _mm_cvt_roundsd_u32<const ROUNDING: i32>(a: __m128d) -> u32 {
static_assert_rounding!(ROUNDING);
let a = a.as_f64x2();
- let r = vcvtsd2usi(a, ROUNDING);
- transmute(r)
+ vcvtsd2usi(a, ROUNDING)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
@@ -37299,7 +37214,7 @@ pub unsafe fn _mm_cvt_roundsd_u32<const ROUNDING: i32>(a: __m128d) -> u32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2si))]
pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 {
- transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
@@ -37309,7 +37224,7 @@ pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2usi))]
pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 {
- transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
@@ -37382,8 +37297,7 @@ pub unsafe fn _mm_cvt_roundu32_ss<const ROUNDING: i32>(a: __m128, b: u32) -> __m
#[cfg_attr(test, assert_instr(vcvtsi2ss))]
pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 {
let b = b as f32;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Convert the signed 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
@@ -37394,8 +37308,7 @@ pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 {
#[cfg_attr(test, assert_instr(vcvtsi2sd))]
pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d {
let b = b as f64;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
@@ -37409,8 +37322,7 @@ pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d {
pub unsafe fn _mm_cvtt_roundss_si32<const SAE: i32>(a: __m128) -> i32 {
static_assert_sae!(SAE);
let a = a.as_f32x4();
- let r = vcvtss2si(a, SAE);
- transmute(r)
+ vcvtss2si(a, SAE)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
@@ -37424,8 +37336,7 @@ pub unsafe fn _mm_cvtt_roundss_si32<const SAE: i32>(a: __m128) -> i32 {
pub unsafe fn _mm_cvtt_roundss_i32<const SAE: i32>(a: __m128) -> i32 {
static_assert_sae!(SAE);
let a = a.as_f32x4();
- let r = vcvtss2si(a, SAE);
- transmute(r)
+ vcvtss2si(a, SAE)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\
@@ -37439,8 +37350,7 @@ pub unsafe fn _mm_cvtt_roundss_i32<const SAE: i32>(a: __m128) -> i32 {
pub unsafe fn _mm_cvtt_roundss_u32<const SAE: i32>(a: __m128) -> u32 {
static_assert_sae!(SAE);
let a = a.as_f32x4();
- let r = vcvtss2usi(a, SAE);
- transmute(r)
+ vcvtss2usi(a, SAE)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
@@ -37450,7 +37360,7 @@ pub unsafe fn _mm_cvtt_roundss_u32<const SAE: i32>(a: __m128) -> u32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2si))]
pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 {
- transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
@@ -37460,7 +37370,7 @@ pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2usi))]
pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 {
- transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+ vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
@@ -37474,8 +37384,7 @@ pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 {
pub unsafe fn _mm_cvtt_roundsd_si32<const SAE: i32>(a: __m128d) -> i32 {
static_assert_sae!(SAE);
let a = a.as_f64x2();
- let r = vcvtsd2si(a, SAE);
- transmute(r)
+ vcvtsd2si(a, SAE)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
@@ -37489,8 +37398,7 @@ pub unsafe fn _mm_cvtt_roundsd_si32<const SAE: i32>(a: __m128d) -> i32 {
pub unsafe fn _mm_cvtt_roundsd_i32<const SAE: i32>(a: __m128d) -> i32 {
static_assert_sae!(SAE);
let a = a.as_f64x2();
- let r = vcvtsd2si(a, SAE);
- transmute(r)
+ vcvtsd2si(a, SAE)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\
@@ -37504,8 +37412,7 @@ pub unsafe fn _mm_cvtt_roundsd_i32<const SAE: i32>(a: __m128d) -> i32 {
pub unsafe fn _mm_cvtt_roundsd_u32<const SAE: i32>(a: __m128d) -> u32 {
static_assert_sae!(SAE);
let a = a.as_f64x2();
- let r = vcvtsd2usi(a, SAE);
- transmute(r)
+ vcvtsd2usi(a, SAE)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
@@ -37515,7 +37422,7 @@ pub unsafe fn _mm_cvtt_roundsd_u32<const SAE: i32>(a: __m128d) -> u32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2si))]
pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 {
- transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
@@ -37525,7 +37432,7 @@ pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 {
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2usi))]
pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 {
- transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+ vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)
}
/// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -37536,8 +37443,7 @@ pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 {
#[cfg_attr(test, assert_instr(vcvtusi2ss))]
pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 {
let b = b as f32;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
@@ -37548,8 +37454,7 @@ pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 {
#[cfg_attr(test, assert_instr(vcvtusi2sd))]
pub unsafe fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d {
let b = b as f64;
- let r = simd_insert(a, 0, b);
- transmute(r)
+ simd_insert(a, 0, b)
}
/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\
@@ -37565,8 +37470,7 @@ pub unsafe fn _mm_comi_round_ss<const IMM5: i32, const SAE: i32>(a: __m128, b: _
static_assert_mantissas_sae!(SAE);
let a = a.as_f32x4();
let b = b.as_f32x4();
- let r = vcomiss(a, b, IMM5, SAE);
- transmute(r)
+ vcomiss(a, b, IMM5, SAE)
}
/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\
@@ -37582,8 +37486,7 @@ pub unsafe fn _mm_comi_round_sd<const IMM5: i32, const SAE: i32>(a: __m128d, b:
static_assert_mantissas_sae!(SAE);
let a = a.as_f64x2();
let b = b.as_f64x2();
- let r = vcomisd(a, b, IMM5, SAE);
- transmute(r)
+ vcomisd(a, b, IMM5, SAE)
}
/// Equal
diff --git a/library/stdarch/crates/core_arch/src/x86/sse.rs b/library/stdarch/crates/core_arch/src/x86/sse.rs
index 3d4471ba3..6a2be0921 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse.rs
@@ -790,8 +790,7 @@ pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
///
/// The result is rounded according to the current rounding mode. If the result
/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
-/// (`i32::MIN`) or an invalid operation floating point exception if
-/// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+/// (`i32::MIN`).
///
/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
///
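A quick illustration of the saturating behaviour the docs above now describe (added here, not part of the diff), using the stable `_mm_cvtss_si32`, which compiles to the same CVTSS2SI instruction; the wrapper name is illustrative:

    use std::arch::x86_64::*;

    unsafe fn cvt_examples() {
        assert_eq!(_mm_cvtss_si32(_mm_set_ss(42.7)), 43);           // nearest-even (default mode)
        assert_eq!(_mm_cvtss_si32(_mm_set_ss(3.0e10)), i32::MIN);   // not representable as i32
        assert_eq!(_mm_cvtss_si32(_mm_set_ss(f32::NAN)), i32::MIN); // NaN also yields i32::MIN
    }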
@@ -821,8 +820,7 @@ pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 {
///
/// The result is rounded always using truncation (round towards zero). If the
/// result cannot be represented as a 32 bit integer the result will be
-/// `0x8000_0000` (`i32::MIN`) or an invalid operation floating point
-/// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
+/// `0x8000_0000` (`i32::MIN`).
///
/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
///
@@ -1083,7 +1081,10 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(movmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
- movmskps(a)
+ // Propagate the highest bit to the rest, because simd_bitmask
+ // requires all-1 or all-0.
+ let mask: i32x4 = simd_lt(transmute(a), i32x4::splat(0));
+ simd_bitmask::<i32x4, u8>(mask).into()
}
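What the new branch-free implementation computes, as a scalar model (an illustration added here, with a made-up name): bit i of the result is the raw sign bit of lane i, which is why the comparison against zero above first smears each lane's sign bit across the whole lane before simd_bitmask packs them.

    fn movemask_ps_model(lanes: [f32; 4]) -> i32 {
        let mut m = 0;
        for (i, x) in lanes.iter().enumerate() {
            m |= ((x.to_bits() >> 31) as i32) << i; // raw sign bit, so -0.0 and negative NaNs count
        }
        m
    }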
/// Construct a `__m128` with the lowest element read from `p` and the other
@@ -1365,6 +1366,15 @@ pub unsafe fn _mm_sfence() {
/// Gets the unsigned 32-bit value of the MXCSR control and status register.
///
+/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust
+/// floating-point operations may or may not result in this register getting updated with exception
+/// state, and the register can change between two invocations of this function even when no
+/// floating-point operations appear in the source code (since floating-point operations appearing
+/// earlier or later can be reordered).
+///
+/// If you need to perform some floating-point operations and check whether they raised an
+/// exception, use an inline assembly block for the entire sequence of operations.
+///
/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr)
@@ -1372,6 +1382,10 @@ pub unsafe fn _mm_sfence() {
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(stmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _mm_getcsr() -> u32 {
let mut result = 0_i32;
stmxcsr(&mut result as *mut _ as *mut i8);
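A minimal sketch of the replacement pattern the new documentation recommends (my addition, assuming an x86-64 target; `divss` is just a stand-in for whatever operation you want to observe and the function name is illustrative): the floating-point work and the MXCSR read stay inside one `asm!` block, so the optimizer cannot separate them.

    use std::arch::asm;

    unsafe fn divss_then_read_mxcsr(a: f32, b: f32) -> (f32, u32) {
        let mut q = a;
        let mut csr: u32 = 0;
        asm!(
            "divss {q}, {d}",      // the FP operation of interest
            "stmxcsr [{csr_ptr}]", // read MXCSR in the same asm block
            q = inout(xmm_reg) q,
            d = in(xmm_reg) b,
            csr_ptr = in(reg) core::ptr::addr_of_mut!(csr),
            options(nostack),
        );
        (q, csr)
    }

The same single-block discipline applies to the _mm_setcsr-style changes further down: alter MXCSR, perform the operations, and restore the saved value before the asm block ends.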
@@ -1401,6 +1415,16 @@ pub unsafe fn _mm_getcsr() -> u32 {
/// * The *denormals-are-zero mode flag* turns all numbers which would be
/// denormalized (exponent bits are all zeros) into zeros.
///
+/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to
+/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and
+/// will optimize accordingly. This even applies when the register is altered and later reset to its
+/// original value without any floating-point operations appearing in the source code between those
+/// operations (since floating-point operations appearing earlier or later can be reordered).
+///
+/// If you need to perform some floating-point operations under different masking flags, rounding
+/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the
+/// original MXCSR register state before the end of the block.
+///
/// ## Exception Flags
///
/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
@@ -1509,6 +1533,10 @@ pub unsafe fn _mm_getcsr() -> u32 {
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ldmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _mm_setcsr(val: u32) {
ldmxcsr(&val as *const _ as *const i8);
}
@@ -1588,9 +1616,14 @@ pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
_mm_getcsr() & _MM_MASK_MASK
}
@@ -1599,9 +1632,14 @@ pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
_mm_getcsr() & _MM_EXCEPT_MASK
}
@@ -1610,9 +1648,14 @@ pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
_mm_getcsr() & _MM_FLUSH_ZERO_MASK
}
@@ -1621,9 +1664,14 @@ pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_getcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
_mm_getcsr() & _MM_ROUND_MASK
}
@@ -1632,9 +1680,14 @@ pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
_mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
}
@@ -1643,9 +1696,14 @@ pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
_mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
}
@@ -1654,9 +1712,14 @@ pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
// println!("setting csr={:x}", val);
@@ -1667,9 +1730,14 @@ pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE)
#[inline]
+#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
+#[deprecated(
+ since = "1.75.0",
+ note = "see `_mm_setcsr` documentation - use inline assembly instead"
+)]
pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
_mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
}
@@ -1820,8 +1888,6 @@ extern "C" {
fn maxss(a: __m128, b: __m128) -> __m128;
#[link_name = "llvm.x86.sse.max.ps"]
fn maxps(a: __m128, b: __m128) -> __m128;
- #[link_name = "llvm.x86.sse.movmsk.ps"]
- fn movmskps(a: __m128) -> i32;
#[link_name = "llvm.x86.sse.cmp.ps"]
fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
#[link_name = "llvm.x86.sse.comieq.ss"]
@@ -1974,7 +2040,11 @@ mod tests {
let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
let r = _mm_rcp_ss(a);
let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
- assert_eq_m128(r, e);
+ let rel_err = 0.00048828125;
+ assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err);
+ for i in 1..4 {
+ assert_eq!(get_m128(r, i), get_m128(e, i));
+ }
}
#[simd_test(enable = "sse")]
@@ -2055,6 +2125,17 @@ mod tests {
let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
let r = _mm_max_ps(a, b);
assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));
+
+ // Check SSE-specific semantics for -0.0 handling.
+ let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
+ let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
+ let r1: [u8; 16] = transmute(_mm_max_ps(a, b));
+ let r2: [u8; 16] = transmute(_mm_max_ps(b, a));
+ let a: [u8; 16] = transmute(a);
+ let b: [u8; 16] = transmute(b);
+ assert_eq!(r1, b);
+ assert_eq!(r2, a);
+ assert_ne!(a, b); // sanity check that -0.0 is actually present
}
#[simd_test(enable = "sse")]
@@ -2098,12 +2179,12 @@ mod tests {
let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
- let e: u32x4 = transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0));
+ let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0));
assert_eq!(r, e);
let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
- let e2: u32x4 = transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0));
+ let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0));
assert_eq!(r2, e2);
}
@@ -2119,15 +2200,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) < d.extract(0)
let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2143,15 +2224,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) <= d.extract(0)
let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2167,15 +2248,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) > d.extract(0)
let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2191,15 +2272,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) >= d.extract(0)
let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2215,15 +2296,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) != d.extract(0)
let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2244,15 +2325,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) >= d.extract(0)
let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2273,15 +2354,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) > d.extract(0)
let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2302,15 +2383,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) <= d.extract(0)
let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2331,15 +2412,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) < d.extract(0)
let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2355,15 +2436,15 @@ mod tests {
let d1 = !0u32; // a.extract(0) ord d.extract(0)
let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2379,15 +2460,15 @@ mod tests {
let d1 = 0u32; // a.extract(0) unord d.extract(0)
let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
- let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
+ let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
assert_eq!(rb, eb);
let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
- let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
+ let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
assert_eq!(rc, ec);
let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
- let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
+ let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
assert_eq!(rd, ed);
}
@@ -2766,7 +2847,9 @@ mod tests {
}
}
+ #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
#[simd_test(enable = "sse")]
+ #[cfg_attr(miri, ignore)] // Uses _mm_setcsr, which is not supported by Miri
unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
// If one of the arguments is a quiet NaN `comieq_ss` should signal an
// Invalid Operation Exception while `ucomieq_ss` should not.
@@ -3072,7 +3155,7 @@ mod tests {
let mut p = vals.as_mut_ptr();
if (p as usize) & 0xf != 0 {
- ofs = ((16 - (p as usize)) & 0xf) >> 2;
+ ofs = (16 - ((p as usize) & 0xf)) >> 2;
p = p.add(ofs);
}
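The corrected offset arithmetic converts the misalignment in bytes into a count of f32 elements to skip: with an address ending in 0x4, (16 - 4) >> 2 == 3 elements reach the next 16-byte boundary. A small sketch of the same computation, with a hypothetical address value, illustrative only:

    fn f32s_to_next_16_byte_boundary(addr: usize) -> usize {
        // bytes to the next 16-byte boundary, in units of 4-byte f32 elements;
        // only used when the address is actually misaligned (addr & 0xf != 0)
        (16 - (addr & 0xf)) >> 2
    }
    // f32s_to_next_16_byte_boundary(0x1004) == 3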
@@ -3098,7 +3181,7 @@ mod tests {
// Align p to 16-byte boundary
if (p as usize) & 0xf != 0 {
- ofs = ((16 - (p as usize)) & 0xf) >> 2;
+ ofs = (16 - ((p as usize) & 0xf)) >> 2;
p = p.add(ofs);
}
@@ -3124,7 +3207,7 @@ mod tests {
// Align p to 16-byte boundary
if (p as usize) & 0xf != 0 {
- ofs = ((16 - (p as usize)) & 0xf) >> 2;
+ ofs = (16 - ((p as usize) & 0xf)) >> 2;
p = p.add(ofs);
}
@@ -3186,11 +3269,15 @@ mod tests {
}
#[simd_test(enable = "sse")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_sfence() {
_mm_sfence();
}
+ #[allow(deprecated)] // FIXME: This tests functions whose use is immediate UB
#[simd_test(enable = "sse")]
+ #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
unsafe fn test_mm_getcsr_setcsr_1() {
let saved_csr = _mm_getcsr();
@@ -3206,7 +3293,9 @@ mod tests {
assert_eq_m128(r, exp); // first component is a denormalized f32
}
+ #[allow(deprecated)] // FIXME: This tests functions whose use is immediate UB
#[simd_test(enable = "sse")]
+ #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
unsafe fn test_mm_getcsr_setcsr_2() {
// Same as _mm_setcsr_1 test, but with opposite flag value.
@@ -3224,7 +3313,9 @@ mod tests {
assert_eq_m128(r, exp); // first component is a denormalized f32
}
+ #[allow(deprecated)] // FIXME: This tests functions whose use is immediate UB
#[simd_test(enable = "sse")]
+ #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
unsafe fn test_mm_getcsr_setcsr_underflow() {
_MM_SET_EXCEPTION_STATE(0);
@@ -3263,6 +3354,9 @@ mod tests {
}
#[simd_test(enable = "sse")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_ps() {
let a = _mm_set1_ps(7.0);
let mut mem = Memory { data: [-1.0; 4] };
diff --git a/library/stdarch/crates/core_arch/src/x86/sse2.rs b/library/stdarch/crates/core_arch/src/x86/sse2.rs
index 3d572a1f5..7831ea743 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs
@@ -165,7 +165,10 @@ pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
- transmute(pavgb(a.as_u8x16(), b.as_u8x16()))
+ let a = simd_cast::<_, u16x16>(a.as_u8x16());
+ let b = simd_cast::<_, u16x16>(b.as_u8x16());
+ let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
+ transmute(simd_cast::<_, u8x16>(r))
}
/// Averages packed unsigned 16-bit integers in `a` and `b`.
@@ -176,7 +179,10 @@ pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
- transmute(pavgw(a.as_u16x8(), b.as_u16x8()))
+ let a = simd_cast::<_, u32x8>(a.as_u16x8());
+ let b = simd_cast::<_, u32x8>(b.as_u16x8());
+ let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
+ transmute(simd_cast::<_, u16x8>(r))
}
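Both averaging rewrites compute the architectural rounding average, (a + b + 1) >> 1, in a lane type twice as wide so the +1 bias and the sum cannot overflow. A scalar sketch of the same rule, illustrative only and not part of the diff:

    fn avg_u8(a: u8, b: u8) -> u8 {
        // widen so the rounding bias cannot overflow, then shift back down
        ((a as u16 + b as u16 + 1) >> 1) as u8
    }
    // avg_u8(255, 254) == 255; an 8-bit addition would have wrapped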
/// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`.
@@ -261,7 +267,10 @@ pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
- transmute(pmulhw(a.as_i16x8(), b.as_i16x8()))
+ let a = simd_cast::<_, i32x8>(a.as_i16x8());
+ let b = simd_cast::<_, i32x8>(b.as_i16x8());
+ let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
+ transmute(simd_cast::<i32x8, i16x8>(r))
}
/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
@@ -275,7 +284,10 @@ pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
- transmute(pmulhuw(a.as_u16x8(), b.as_u16x8()))
+ let a = simd_cast::<_, u32x8>(a.as_u16x8());
+ let b = simd_cast::<_, u32x8>(b.as_u16x8());
+ let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
+ transmute(simd_cast::<u32x8, u16x8>(r))
}
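pmulhw and pmulhuw return the high 16 bits of the full 32-bit product, which is exactly what the widening multiply followed by a 16-bit shift expresses. A scalar sketch, illustrative only:

    fn mulhi_i16(a: i16, b: i16) -> i16 {
        ((a as i32 * b as i32) >> 16) as i16
    }
    fn mulhi_u16(a: u16, b: u16) -> u16 {
        ((a as u32 * b as u32) >> 16) as u16
    }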
/// Multiplies the packed 16-bit integers in `a` and `b`.
@@ -303,7 +315,10 @@ pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
- transmute(pmuludq(a.as_u32x4(), b.as_u32x4()))
+ let a = a.as_u64x2();
+ let b = b.as_u64x2();
+ let mask = u64x2::splat(u32::MAX.into());
+ transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
}
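_mm_mul_epu32 multiplies only the low 32 bits of each 64-bit lane, so masking both operands with u32::MAX before a plain 64-bit multiply reproduces the instruction without the dedicated intrinsic call. A per-lane scalar sketch, illustrative only:

    fn mul_lo32_unsigned(a: u64, b: u64) -> u64 {
        // only the low 32 bits of each lane participate; the product fits in 64 bits
        (a & u32::MAX as u64) * (b & u32::MAX as u64)
    }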
/// Sum the absolute differences of packed unsigned 8-bit integers.
@@ -952,7 +967,7 @@ pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
#[cfg_attr(test, assert_instr(cvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
- cvtdq2ps(a.as_i32x4())
+ transmute(simd_cast::<_, f32x4>(a.as_i32x4()))
}
/// Converts packed single-precision (32-bit) floating-point elements in `a`
@@ -2240,7 +2255,9 @@ pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
#[cfg_attr(test, assert_instr(cvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
- cvtpd2ps(a)
+ let r = simd_cast::<_, f32x2>(a.as_f64x2());
+ let zero = f32x2::new(0.0, 0.0);
+ transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
}
/// Converts packed single-precision (32-bit) floating-point elements in `a` to
@@ -2253,7 +2270,8 @@ pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
#[cfg_attr(test, assert_instr(cvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d {
- cvtps2pd(a)
+ let a = a.as_f32x4();
+ transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
}
/// Converts packed double-precision (64-bit) floating-point elements in `a` to
@@ -2432,7 +2450,10 @@ pub unsafe fn _mm_setzero_pd() -> __m128d {
#[cfg_attr(test, assert_instr(movmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 {
- movmskpd(a)
+ // Propagate the highest bit to the rest, because simd_bitmask
+ // requires all-1 or all-0.
+ let mask: i64x2 = simd_lt(transmute(a), i64x2::splat(0));
+ simd_bitmask::<i64x2, u8>(mask).into()
}
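simd_bitmask packs one bit per lane but expects each lane to be all-zeros or all-ones, so the sign bit is first broadcast across the lane with a signed less-than-zero compare. A scalar sketch of the resulting movemask, illustrative only:

    fn movemask_pd_scalar(lanes: [f64; 2]) -> i32 {
        let mut mask = 0;
        for (i, lane) in lanes.iter().enumerate() {
            // bit 63 of each lane's raw encoding is its sign bit
            mask |= ((lane.to_bits() >> 63) as i32) << i;
        }
        mask
    }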
/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
@@ -2826,18 +2847,8 @@ extern "C" {
fn lfence();
#[link_name = "llvm.x86.sse2.mfence"]
fn mfence();
- #[link_name = "llvm.x86.sse2.pavg.b"]
- fn pavgb(a: u8x16, b: u8x16) -> u8x16;
- #[link_name = "llvm.x86.sse2.pavg.w"]
- fn pavgw(a: u16x8, b: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse2.pmadd.wd"]
fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
- #[link_name = "llvm.x86.sse2.pmulh.w"]
- fn pmulhw(a: i16x8, b: i16x8) -> i16x8;
- #[link_name = "llvm.x86.sse2.pmulhu.w"]
- fn pmulhuw(a: u16x8, b: u16x8) -> u16x8;
- #[link_name = "llvm.x86.sse2.pmulu.dq"]
- fn pmuludq(a: u32x4, b: u32x4) -> u64x2;
#[link_name = "llvm.x86.sse2.psad.bw"]
fn psadbw(a: u8x16, b: u8x16) -> u64x2;
#[link_name = "llvm.x86.sse2.psll.w"]
@@ -2856,8 +2867,6 @@ extern "C" {
fn psrld(a: i32x4, count: i32x4) -> i32x4;
#[link_name = "llvm.x86.sse2.psrl.q"]
fn psrlq(a: i64x2, count: i64x2) -> i64x2;
- #[link_name = "llvm.x86.sse2.cvtdq2ps"]
- fn cvtdq2ps(a: i32x4) -> __m128;
#[link_name = "llvm.x86.sse2.cvtps2dq"]
fn cvtps2dq(a: __m128) -> i32x4;
#[link_name = "llvm.x86.sse2.maskmov.dqu"]
@@ -2908,12 +2917,6 @@ extern "C" {
fn ucomigesd(a: __m128d, b: __m128d) -> i32;
#[link_name = "llvm.x86.sse2.ucomineq.sd"]
fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
- #[link_name = "llvm.x86.sse2.movmsk.pd"]
- fn movmskpd(a: __m128d) -> i32;
- #[link_name = "llvm.x86.sse2.cvtpd2ps"]
- fn cvtpd2ps(a: __m128d) -> __m128;
- #[link_name = "llvm.x86.sse2.cvtps2pd"]
- fn cvtps2pd(a: __m128) -> __m128d;
#[link_name = "llvm.x86.sse2.cvtpd2dq"]
fn cvtpd2dq(a: __m128d) -> i32x4;
#[link_name = "llvm.x86.sse2.cvtsd2si"]
@@ -2956,11 +2959,15 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_lfence() {
_mm_lfence();
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_mfence() {
_mm_mfence();
}
@@ -3343,83 +3350,124 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_slli_epi16() {
- #[rustfmt::skip]
- let a = _mm_setr_epi16(
- 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
- );
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
let r = _mm_slli_epi16::<4>(a);
-
- #[rustfmt::skip]
- let e = _mm_setr_epi16(
- 0xFFF0 as u16 as i16, 0xFFF0 as u16 as i16, 0x0FF0, 0x00F0,
- 0, 0, 0, 0,
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
);
- assert_eq_m128i(r, e);
+ let r = _mm_slli_epi16::<16>(a);
+ assert_eq_m128i(r, _mm_set1_epi16(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sll_epi16() {
- let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0);
- let r = _mm_sll_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0));
- assert_eq_m128i(r, _mm_setr_epi16(0xFF0, 0, 0, 0, 0, 0, 0, 0));
- let r = _mm_sll_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0));
- assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0));
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
+ let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
+ );
+ let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_set1_epi16(0));
+ let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi16(0));
}
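The reworked shift tests pin down two PSLLW details: only the low 64 bits of the count vector are used (so _mm_set_epi64x(4, 0) leaves `a` unchanged), and a count of 16 or more zeroes every lane instead of being masked the way a scalar shift count would be. A per-lane sketch of the zeroing rule, illustrative only:

    fn sll_epi16_lane(x: i16, count: u64) -> i16 {
        // counts of 16 or more clear the lane rather than wrapping the shift amount
        if count >= 16 { 0 } else { ((x as u16) << count) as i16 }
    }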
#[simd_test(enable = "sse2")]
unsafe fn test_mm_slli_epi32() {
- let r = _mm_slli_epi32::<4>(_mm_set1_epi32(0xFFFF));
- assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_slli_epi32::<4>(a);
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
+ let r = _mm_slli_epi32::<32>(a);
+ assert_eq_m128i(r, _mm_set1_epi32(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sll_epi32() {
- let a = _mm_set1_epi32(0xFFFF);
- let b = _mm_setr_epi32(4, 0, 0, 0);
- let r = _mm_sll_epi32(a, b);
- assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
+ let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
+ assert_eq_m128i(r, _mm_set1_epi32(0));
+ let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi32(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_slli_epi64() {
- let r = _mm_slli_epi64::<4>(_mm_set1_epi64x(0xFFFFFFFF));
- assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0));
+ let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm_slli_epi64::<4>(a);
+ assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
+ let r = _mm_slli_epi64::<64>(a);
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sll_epi64() {
- let a = _mm_set1_epi64x(0xFFFFFFFF);
- let b = _mm_setr_epi64x(4, 0);
- let r = _mm_sll_epi64(a, b);
- assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0));
+ let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
+ let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
+ let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srai_epi16() {
- let r = _mm_srai_epi16::<1>(_mm_set1_epi16(-1));
- assert_eq_m128i(r, _mm_set1_epi16(-1));
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
+ let r = _mm_srai_epi16::<4>(a);
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
+ );
+ let r = _mm_srai_epi16::<16>(a);
+ assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sra_epi16() {
- let a = _mm_set1_epi16(-1);
- let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- let r = _mm_sra_epi16(a, b);
- assert_eq_m128i(r, _mm_set1_epi16(-1));
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
+ let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
+ );
+ let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
+ let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
}
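For PSRAW the oversized-count behaviour differs: counts of 16 or more act like a shift by 15, so each lane collapses to 0 or -1 depending on its sign, which is what the ::<16> and i64::MAX cases check. A per-lane sketch, illustrative only:

    fn sra_epi16_lane(x: i16, count: u64) -> i16 {
        // arithmetic right shifts saturate at 15 bits, filling with the sign bit
        x >> count.min(15)
    }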
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srai_epi32() {
- let r = _mm_srai_epi32::<1>(_mm_set1_epi32(-1));
- assert_eq_m128i(r, _mm_set1_epi32(-1));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_srai_epi32::<4>(a);
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
+ let r = _mm_srai_epi32::<32>(a);
+ assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_sra_epi32() {
- let a = _mm_set1_epi32(-1);
- let b = _mm_setr_epi32(1, 0, 0, 0);
- let r = _mm_sra_epi32(a, b);
- assert_eq_m128i(r, _mm_set1_epi32(-1));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
+ let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
+ assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
+ let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
}
#[simd_test(enable = "sse2")]
@@ -3453,53 +3501,74 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srli_epi16() {
- #[rustfmt::skip]
- let a = _mm_setr_epi16(
- 0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
- );
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
let r = _mm_srli_epi16::<4>(a);
- #[rustfmt::skip]
- let e = _mm_setr_epi16(
- 0xFFF as u16 as i16, 0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0,
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
);
- assert_eq_m128i(r, e);
+ let r = _mm_srli_epi16::<16>(a);
+ assert_eq_m128i(r, _mm_set1_epi16(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srl_epi16() {
- let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0);
- let r = _mm_srl_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0));
- assert_eq_m128i(r, _mm_setr_epi16(0xF, 0, 0, 0, 0, 0, 0, 0));
- let r = _mm_srl_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0));
- assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0));
+ let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
+ let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(
+ r,
+ _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
+ );
+ let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_set1_epi16(0));
+ let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi16(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srli_epi32() {
- let r = _mm_srli_epi32::<4>(_mm_set1_epi32(0xFFFF));
- assert_eq_m128i(r, _mm_set1_epi32(0xFFF));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_srli_epi32::<4>(a);
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
+ let r = _mm_srli_epi32::<32>(a);
+ assert_eq_m128i(r, _mm_set1_epi32(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srl_epi32() {
- let a = _mm_set1_epi32(0xFFFF);
- let b = _mm_setr_epi32(4, 0, 0, 0);
- let r = _mm_srl_epi32(a, b);
- assert_eq_m128i(r, _mm_set1_epi32(0xFFF));
+ let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
+ let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
+ let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
+ assert_eq_m128i(r, _mm_set1_epi32(0));
+ let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi32(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srli_epi64() {
- let r = _mm_srli_epi64::<4>(_mm_set1_epi64x(0xFFFFFFFF));
- assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF));
+ let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm_srli_epi64::<4>(a);
+ assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
+ let r = _mm_srli_epi64::<64>(a);
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
}
#[simd_test(enable = "sse2")]
unsafe fn test_mm_srl_epi64() {
- let a = _mm_set1_epi64x(0xFFFFFFFF);
- let b = _mm_setr_epi64x(4, 0);
- let r = _mm_srl_epi64(a, b);
- assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF));
+ let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
+ let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
+ assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
+ let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
+ assert_eq_m128i(r, a);
+ let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
+ let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
+ assert_eq_m128i(r, _mm_set1_epi64x(0));
}
#[simd_test(enable = "sse2")]
@@ -3766,6 +3835,9 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_maskmoveu_si128() {
let a = _mm_set1_epi8(9);
#[rustfmt::skip]
@@ -3804,6 +3876,9 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_si128() {
let a = _mm_setr_epi32(1, 2, 3, 4);
let mut r = _mm_undefined_si128();
@@ -3812,6 +3887,9 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_si32() {
let a: i32 = 7;
let mut mem = boxed::Box::<i32>::new(-1);
@@ -4055,6 +4133,17 @@ mod tests {
let b = _mm_setr_pd(5.0, 10.0);
let r = _mm_max_pd(a, b);
assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));
+
+ // Check SSE(2)-specific semantics for -0.0 handling.
+ let a = _mm_setr_pd(-0.0, 0.0);
+ let b = _mm_setr_pd(0.0, 0.0);
+ let r1: [u8; 16] = transmute(_mm_max_pd(a, b));
+ let r2: [u8; 16] = transmute(_mm_max_pd(b, a));
+ let a: [u8; 16] = transmute(a);
+ let b: [u8; 16] = transmute(b);
+ assert_eq!(r1, b);
+ assert_eq!(r2, a);
+ assert_ne!(a, b); // sanity check that -0.0 is actually present
}
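These byte-level assertions encode the SSE rule that maxpd returns its second operand whenever the first is not strictly greater, so max(-0.0, 0.0) is +0.0 while max(0.0, -0.0) is -0.0; the minpd test below mirrors it. A scalar sketch of the rule, illustrative only:

    fn sse_max_pd_lane(a: f64, b: f64) -> f64 {
        // b is returned for equal operands (including signed zeros) and for NaN inputs
        if a > b { a } else { b }
    }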
#[simd_test(enable = "sse2")]
@@ -4071,6 +4160,17 @@ mod tests {
let b = _mm_setr_pd(5.0, 10.0);
let r = _mm_min_pd(a, b);
assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
+
+ // Check SSE(2)-specific semantics for -0.0 handling.
+ let a = _mm_setr_pd(-0.0, 0.0);
+ let b = _mm_setr_pd(0.0, 0.0);
+ let r1: [u8; 16] = transmute(_mm_min_pd(a, b));
+ let r2: [u8; 16] = transmute(_mm_min_pd(b, a));
+ let a: [u8; 16] = transmute(a);
+ let b: [u8; 16] = transmute(b);
+ assert_eq!(r1, b);
+ assert_eq!(r2, a);
+ assert_ne!(a, b); // sanity check that -0.0 is actually present
}
#[simd_test(enable = "sse2")]
@@ -4158,7 +4258,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpeq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4166,7 +4266,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmplt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4174,7 +4274,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmple_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4182,7 +4282,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpgt_sd() {
let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4190,7 +4290,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpge_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4198,7 +4298,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpord_sd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4206,7 +4306,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpunord_sd() {
let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4214,7 +4314,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpneq_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(!0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4222,7 +4322,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpnlt_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4230,7 +4330,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpnle_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4238,7 +4338,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpngt_sd() {
let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4246,7 +4346,7 @@ mod tests {
#[simd_test(enable = "sse2")]
unsafe fn test_mm_cmpnge_sd() {
let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
- let e = _mm_setr_epi64x(0, transmute(2.0f64));
+ let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
assert_eq_m128i(r, e);
}
@@ -4478,6 +4578,9 @@ mod tests {
}
#[simd_test(enable = "sse2")]
+ // Miri cannot support this until it is clear how it fits in the Rust memory model
+ // (non-temporal store)
+ #[cfg_attr(miri, ignore)]
unsafe fn test_mm_stream_pd() {
#[repr(align(128))]
struct Memory {
diff --git a/library/stdarch/crates/core_arch/src/x86/sse3.rs b/library/stdarch/crates/core_arch/src/x86/sse3.rs
index 092a8d9cd..df0d78e5b 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse3.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse3.rs
@@ -1,7 +1,7 @@
//! Streaming SIMD Extensions 3 (SSE3)
use crate::{
- core_arch::{simd::*, simd_llvm::simd_shuffle, x86::*},
+ core_arch::{simd::*, simd_llvm::*, x86::*},
mem::transmute,
};
@@ -17,7 +17,11 @@ use stdarch_test::assert_instr;
#[cfg_attr(test, assert_instr(addsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
- addsubps(a, b)
+ let a = a.as_f32x4();
+ let b = b.as_f32x4();
+ let add = simd_add(a, b);
+ let sub = simd_sub(a, b);
+ simd_shuffle!(add, sub, [4, 1, 6, 3])
}
/// Alternatively add and subtract packed double-precision (64-bit)
@@ -29,7 +33,11 @@ pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
#[cfg_attr(test, assert_instr(addsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d {
- addsubpd(a, b)
+ let a = a.as_f64x2();
+ let b = b.as_f64x2();
+ let add = simd_add(a, b);
+ let sub = simd_sub(a, b);
+ simd_shuffle!(add, sub, [2, 1])
}
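In simd_shuffle! the index vector addresses the concatenation of both inputs, so indices at or above the lane count select from the second argument: [2, 1] takes lane 0 from `sub` and lane 1 from `add`, i.e. subtract in the even lane, add in the odd lane. A two-lane scalar sketch, illustrative only:

    fn addsub_pd(a: [f64; 2], b: [f64; 2]) -> [f64; 2] {
        // even lane subtracts, odd lane adds, matching the [2, 1] shuffle of (add, sub)
        [a[0] - b[0], a[1] + b[1]]
    }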
/// Horizontally adds adjacent pairs of double-precision (64-bit)
@@ -143,10 +151,6 @@ pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 {
#[allow(improper_ctypes)]
extern "C" {
- #[link_name = "llvm.x86.sse3.addsub.ps"]
- fn addsubps(a: __m128, b: __m128) -> __m128;
- #[link_name = "llvm.x86.sse3.addsub.pd"]
- fn addsubpd(a: __m128d, b: __m128d) -> __m128d;
#[link_name = "llvm.x86.sse3.hadd.pd"]
fn haddpd(a: __m128d, b: __m128d) -> __m128d;
#[link_name = "llvm.x86.sse3.hadd.ps"]
diff --git a/library/stdarch/crates/core_arch/src/x86/sse41.rs b/library/stdarch/crates/core_arch/src/x86/sse41.rs
index 7ba86e5f7..6d33238b0 100644
--- a/library/stdarch/crates/core_arch/src/x86/sse41.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs
@@ -62,7 +62,8 @@ pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTI
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
- transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
+ let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::splat(0));
+ transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
}
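pblendvb keys each byte off the most significant bit of the mask; the simd_lt against zero expands that bit into an all-ones or all-zeros lane so simd_select can choose between `b` and `a`. A per-byte sketch, illustrative only:

    fn blendv_byte(a: i8, b: i8, mask: i8) -> i8 {
        // mask < 0 exactly when its top bit is set
        if mask < 0 { b } else { a }
    }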
/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
@@ -74,15 +75,25 @@ pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
-// Note: LLVM7 prefers the single-precision floating-point domain when possible
-// see https://bugs.llvm.org/show_bug.cgi?id=38195
-// #[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xF0))]
-#[cfg_attr(test, assert_instr(blendps, IMM8 = 0xF0))]
+#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
static_assert_uimm_bits!(IMM8, 8);
- transmute(pblendw(a.as_i16x8(), b.as_i16x8(), IMM8 as u8))
+ transmute::<i16x8, _>(simd_shuffle!(
+ a.as_i16x8(),
+ b.as_i16x8(),
+ [
+ [0, 8][IMM8 as usize & 1],
+ [1, 9][(IMM8 >> 1) as usize & 1],
+ [2, 10][(IMM8 >> 2) as usize & 1],
+ [3, 11][(IMM8 >> 3) as usize & 1],
+ [4, 12][(IMM8 >> 4) as usize & 1],
+ [5, 13][(IMM8 >> 5) as usize & 1],
+ [6, 14][(IMM8 >> 6) as usize & 1],
+ [7, 15][(IMM8 >> 7) as usize & 1],
+ ]
+ ))
}
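Each shuffle index is picked by one bit of IMM8: [k, k + 8][bit] resolves to lane k of `a` when the bit is 0 and to lane k of `b` (offset by 8 into the concatenation) when it is 1. The same selection as a scalar loop, illustrative only:

    fn blend_epi16(a: [i16; 8], b: [i16; 8], imm8: u8) -> [i16; 8] {
        let mut r = a;
        for k in 0..8 {
            if (imm8 >> k) & 1 != 0 {
                r[k] = b[k]; // bit k set: take lane k from b
            }
        }
        r
    }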
/// Blend packed double-precision (64-bit) floating-point elements from `a`
@@ -94,7 +105,8 @@ pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
- blendvpd(a, b, mask)
+ let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::splat(0));
+ transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
}
/// Blend packed single-precision (32-bit) floating-point elements from `a`
@@ -106,7 +118,8 @@ pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
- blendvps(a, b, mask)
+ let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::splat(0));
+ transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
}
/// Blend packed double-precision (64-bit) floating-point elements from `a`
@@ -123,7 +136,11 @@ pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
static_assert_uimm_bits!(IMM2, 2);
- blendpd(a, b, IMM2 as u8)
+ transmute::<f64x2, _>(simd_shuffle!(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
+ ))
}
/// Blend packed single-precision (32-bit) floating-point elements from `a`
@@ -137,7 +154,16 @@ pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
static_assert_uimm_bits!(IMM4, 4);
- blendps(a, b, IMM4 as u8)
+ transmute::<f32x4, _>(simd_shuffle!(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ [
+ [0, 4][IMM4 as usize & 1],
+ [1, 5][(IMM4 >> 1) as usize & 1],
+ [2, 6][(IMM4 >> 2) as usize & 1],
+ [3, 7][(IMM4 >> 3) as usize & 1],
+ ]
+ ))
}
/// Extracts a single-precision (32-bit) floating-point element from `a`,
@@ -175,7 +201,7 @@ pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
static_assert_uimm_bits!(IMM8, 2);
- transmute(simd_extract::<_, f32>(a, IMM8 as u32))
+ simd_extract::<_, f32>(a, IMM8 as u32).to_bits() as i32
}
/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
@@ -923,7 +949,9 @@ pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
- transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
+ let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2()));
+ let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2()));
+ transmute(simd_mul(a, b))
}
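Here the two nested simd_casts truncate each 64-bit lane to its low 32 bits and then sign-extend back, which is the operand preparation PMULDQ performs before the full 64-bit multiply. A per-lane scalar sketch, illustrative only:

    fn mul_lo32_signed(a: i64, b: i64) -> i64 {
        // truncate to i32, sign-extend back to i64, then take the full product
        (a as i32 as i64) * (b as i32 as i64)
    }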
/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate
@@ -1124,18 +1152,6 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
#[allow(improper_ctypes)]
extern "C" {
- #[link_name = "llvm.x86.sse41.pblendvb"]
- fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
- #[link_name = "llvm.x86.sse41.blendvpd"]
- fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
- #[link_name = "llvm.x86.sse41.blendvps"]
- fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
- #[link_name = "llvm.x86.sse41.blendpd"]
- fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
- #[link_name = "llvm.x86.sse41.blendps"]
- fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
- #[link_name = "llvm.x86.sse41.pblendw"]
- fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
#[link_name = "llvm.x86.sse41.insertps"]
fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
#[link_name = "llvm.x86.sse41.packusdw"]
@@ -1154,8 +1170,6 @@ extern "C" {
fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
#[link_name = "llvm.x86.sse41.phminposuw"]
fn phminposuw(a: u16x8) -> u16x8;
- #[link_name = "llvm.x86.sse41.pmuldq"]
- fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
#[link_name = "llvm.x86.sse41.mpsadbw"]
fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
#[link_name = "llvm.x86.sse41.ptestz"]
@@ -1245,9 +1259,9 @@ mod tests {
#[simd_test(enable = "sse4.1")]
unsafe fn test_mm_extract_ps() {
let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
- let r: f32 = transmute(_mm_extract_ps::<1>(a));
+ let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32);
assert_eq!(r, 1.0);
- let r: f32 = transmute(_mm_extract_ps::<3>(a));
+ let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32);
assert_eq!(r, 3.0);
}
@@ -1668,6 +1682,7 @@ mod tests {
assert_eq_m128(r, e);
}
+ #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
#[simd_test(enable = "sse4.1")]
unsafe fn test_mm_round_sd() {
let a = _mm_setr_pd(1.5, 3.5);
@@ -1680,6 +1695,7 @@ mod tests {
assert_eq_m128d(r, e);
}
+ #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
#[simd_test(enable = "sse4.1")]
unsafe fn test_mm_round_ss() {
let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
diff --git a/library/stdarch/crates/core_arch/src/x86/test.rs b/library/stdarch/crates/core_arch/src/x86/test.rs
index ec4298033..50b2d93be 100644
--- a/library/stdarch/crates/core_arch/src/x86/test.rs
+++ b/library/stdarch/crates/core_arch/src/x86/test.rs
@@ -3,11 +3,13 @@
use crate::core_arch::x86::*;
use std::mem::transmute;
+#[track_caller]
#[target_feature(enable = "sse2")]
pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
}
+#[track_caller]
#[target_feature(enable = "sse2")]
pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
@@ -20,6 +22,7 @@ pub unsafe fn get_m128d(a: __m128d, idx: usize) -> f64 {
transmute::<_, [f64; 2]>(a)[idx]
}
+#[track_caller]
#[target_feature(enable = "sse")]
pub unsafe fn assert_eq_m128(a: __m128, b: __m128) {
let r = _mm_cmpeq_ps(a, b);
@@ -40,11 +43,13 @@ pub unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
_mm_set_epi64x(b, a)
}
+#[track_caller]
#[target_feature(enable = "avx")]
pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b))
}
+#[track_caller]
#[target_feature(enable = "avx")]
pub unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) {
let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b);
@@ -58,6 +63,7 @@ pub unsafe fn get_m256d(a: __m256d, idx: usize) -> f64 {
transmute::<_, [f64; 4]>(a)[idx]
}
+#[track_caller]
#[target_feature(enable = "avx")]
pub unsafe fn assert_eq_m256(a: __m256, b: __m256) {
let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b);
@@ -125,10 +131,12 @@ mod x86_polyfill {
}
pub use self::x86_polyfill::*;
+#[track_caller]
pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) {
assert_eq!(transmute::<_, [i32; 16]>(a), transmute::<_, [i32; 16]>(b))
}
+#[track_caller]
pub unsafe fn assert_eq_m512(a: __m512, b: __m512) {
let cmp = _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b);
if cmp != 0b11111111_11111111 {
@@ -136,6 +144,7 @@ pub unsafe fn assert_eq_m512(a: __m512, b: __m512) {
}
}
+#[track_caller]
pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) {
let cmp = _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b);
if cmp != 0b11111111 {