Diffstat (limited to 'library/stdarch/crates/core_arch/src/x86/avx2.rs')
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/avx2.rs  5908
1 file changed, 5908 insertions, 0 deletions
diff --git a/library/stdarch/crates/core_arch/src/x86/avx2.rs b/library/stdarch/crates/core_arch/src/x86/avx2.rs
new file mode 100644
index 000000000..081609ece
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@@ -0,0 +1,5908 @@
+//! Advanced Vector Extensions 2 (AVX2)
+//!
+//! AVX2 expands most AVX commands to 256-bit wide vector registers and
+//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//! Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//! System Instructions][amd64_ref].
+//!
+//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
+//! overview of the instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
+//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
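+//!
+//! A minimal usage sketch (an assumed calling pattern, not taken from the
+//! references above): verify AVX2 at runtime, then call the intrinsics from
+//! inside an `unsafe` block.
+//!
+//! ```ignore
+//! use std::arch::x86_64::*;
+//!
+//! fn add_lanes() -> Option<__m256i> {
+//!     if is_x86_feature_detected!("avx2") {
+//!         // Safety: the runtime check above guarantees AVX2 is available.
+//!         return Some(unsafe { _mm256_add_epi32(_mm256_set1_epi32(7), _mm256_set1_epi32(35)) });
+//!     }
+//!     None
+//! }
+//! ```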
+
+use crate::{
+ core_arch::{simd::*, simd_llvm::*, x86::*},
+ mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Computes the absolute values of packed 32-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpabsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
+ transmute(pabsd(a.as_i32x8()))
+}
+
+/// Computes the absolute values of packed 16-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpabsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
+ transmute(pabsw(a.as_i16x16()))
+}
+
+/// Computes the absolute values of packed 8-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpabsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
+ transmute(pabsb(a.as_i8x32()))
+}
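+
+// A minimal usage sketch for the `abs` intrinsics above, assuming AVX2 has
+// been verified at runtime and the calls are made inside `unsafe`:
+//
+//     let a = _mm256_set1_epi32(-5);
+//     let r = _mm256_abs_epi32(a); // every 32-bit lane becomes 5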
+
+/// Adds packed 64-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_add(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Adds packed 32-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_add(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Adds packed 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_add(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Adds packed 8-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_add(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Adds packed 8-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Adds packed 16-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddusb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpaddusw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16()))
+}
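+
+// Sketch contrasting the wrapping and saturating adds above (AVX2 assumed,
+// calls made inside `unsafe`):
+//
+//     let a = _mm256_set1_epi8(100);
+//     let b = _mm256_set1_epi8(100);
+//     let wrapping = _mm256_add_epi8(a, b);   // lanes wrap around to -56
+//     let saturated = _mm256_adds_epi8(a, b); // lanes clamp to 127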
+
+/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
+/// result, shifts the result right by `IMM8` bytes, and returns the low 16
+/// bytes.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+    // If palignr is shifting the pair of vectors by at least the size of two
+    // lanes (32 bytes), the result is zero.
+    if IMM8 >= 32 {
+        return _mm256_set1_epi8(0);
+    }
+    // If palignr is shifting the pair of input vectors by one lane or more,
+    // but less than two lanes, convert to shifting in zeroes.
+    let (a, b) = if IMM8 >= 16 {
+        (_mm256_set1_epi8(0), a)
+    } else {
+        (a, b)
+    };
+
+ let a = a.as_i8x32();
+ let b = b.as_i8x32();
+
+ let r: i8x32 = match IMM8 % 16 {
+ 0 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ ],
+ ),
+ 1 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 48,
+ ],
+ ),
+ 2 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 48, 49,
+ ],
+ ),
+ 3 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
+ ],
+ ),
+ 4 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
+ ],
+ ),
+ 5 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
+ ],
+ ),
+ 6 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
+ ],
+ ),
+ 7 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
+ ],
+ ),
+ 8 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28,
+ 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
+ ],
+ ),
+ 9 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29,
+ 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
+ ],
+ ),
+ 10 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30,
+ 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
+ ],
+ ),
+ 11 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+ ],
+ ),
+ 12 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ ],
+ ),
+ 13 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+ ],
+ ),
+ 14 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+ ],
+ ),
+ 15 => simd_shuffle32!(
+ b,
+ a,
+ [
+ 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+ ],
+ ),
+ _ => b,
+ };
+ transmute(r)
+}
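+
+// Sketch of `_mm256_alignr_epi8` (AVX2 assumed, inside `unsafe`): with
+// `IMM8 = 1`, each 128-bit lane of the result is the concatenation
+// `a_lane:b_lane` shifted right by one byte.
+//
+//     let a = _mm256_set1_epi8(1);
+//     let b = _mm256_set1_epi8(2);
+//     // byte 15 of each lane comes from `a`, the other bytes from `b`
+//     let r = _mm256_alignr_epi8::<1>(a, b);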
+
+/// Computes the bitwise AND of 256 bits (representing integer data)
+/// in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vandps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_and(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Computes the bitwise NOT of 256 bits (representing integer data)
+/// in `a` and then AND with `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vandnps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
+ let all_ones = _mm256_set1_epi8(-1);
+ transmute(simd_and(
+ simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
+ b.as_i64x4(),
+ ))
+}
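+
+// Sketch: `_mm256_andnot_si256(a, b)` computes `(!a) & b`, which clears the
+// bits of `b` that are set in `a` (AVX2 assumed, inside `unsafe`):
+//
+//     let mask = _mm256_set1_epi32(0x0000_ffff);
+//     let x = _mm256_set1_epi32(0x1234_5678);
+//     let r = _mm256_andnot_si256(mask, x); // every lane is 0x1234_0000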
+
+/// Averages packed unsigned 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpavgw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pavgw(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Averages packed unsigned 8-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpavgb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pavgb(a.as_u8x32(), b.as_u8x32()))
+}
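+
+// Sketch: the `avg` intrinsics above compute the rounding average
+// `(a + b + 1) >> 1` on unsigned lanes (AVX2 assumed, inside `unsafe`):
+//
+//     let a = _mm256_set1_epi8(1);
+//     let b = _mm256_set1_epi8(2);
+//     let r = _mm256_avg_epu8(a, b); // every lane is 2, not 1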
+
+/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
+ static_assert_imm4!(IMM4);
+ let a = a.as_i32x4();
+ let b = b.as_i32x4();
+ let r: i32x4 = simd_shuffle4!(
+ a,
+ b,
+ <const IMM4: i32> [
+ [0, 4, 0, 4][IMM4 as usize & 0b11],
+ [1, 1, 5, 5][IMM4 as usize & 0b11],
+ [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
+ [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
+ ],
+ );
+ transmute(r)
+}
+
+/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i32x8();
+ let b = b.as_i32x8();
+ let r: i32x8 = simd_shuffle8!(
+ a,
+ b,
+ <const IMM8: i32> [
+ [0, 8, 0, 8][IMM8 as usize & 0b11],
+ [1, 1, 9, 9][IMM8 as usize & 0b11],
+ [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
+ [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
+ [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
+ [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
+ [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
+ [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
+ ],
+ );
+ transmute(r)
+}
+
+/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i16x16();
+ let b = b.as_i16x16();
+
+ let r: i16x16 = simd_shuffle16!(
+ a,
+ b,
+ <const IMM8: i32> [
+ [0, 16, 0, 16][IMM8 as usize & 0b11],
+ [1, 1, 17, 17][IMM8 as usize & 0b11],
+ [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
+ [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
+ [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
+ [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
+ [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
+ [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
+ [8, 24, 8, 24][IMM8 as usize & 0b11],
+ [9, 9, 25, 25][IMM8 as usize & 0b11],
+ [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
+ [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
+ [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
+ [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
+ [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
+ [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
+ ],
+ );
+ transmute(r)
+}
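+
+// Sketch for the immediate-controlled blends above (AVX2 assumed, inside
+// `unsafe`): bit `i` of the mask selects element `i` from `b` when set, and
+// from `a` when clear.
+//
+//     let a = _mm256_set1_epi32(0);
+//     let b = _mm256_set1_epi32(1);
+//     // take the low four elements from `b`, the high four from `a`
+//     let r = _mm256_blend_epi32::<0b0000_1111>(a, b);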
+
+/// Blends packed 8-bit integers from `a` and `b` using `mask`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpblendvb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
+ transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
+}
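+
+// Sketch: `_mm256_blendv_epi8` takes each byte from `b` when the
+// corresponding mask byte has its highest bit set, otherwise from `a`
+// (AVX2 assumed, inside `unsafe`):
+//
+//     let a = _mm256_set1_epi8(3);
+//     let b = _mm256_set1_epi8(7);
+//     let mask = _mm256_cmpgt_epi8(b, a);       // all ones where b > a
+//     let max = _mm256_blendv_epi8(a, b, mask); // per-byte signed maximum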
+
+/// Broadcasts the low packed 8-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastb_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle16!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
+ transmute::<i8x16, _>(ret)
+}
+
+/// Broadcasts the low packed 8-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastb_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpbroadcastb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle32!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
+ transmute::<i8x32, _>(ret)
+}
+
+// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
+// often compiled to `vbroadcastss`.
+/// Broadcasts the low packed 32-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastd_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle4!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
+ transmute::<i32x4, _>(ret)
+}
+
+// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
+// often compiled to `vbroadcastss`.
+/// Broadcasts the low packed 32-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastd_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle8!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
+ transmute::<i32x8, _>(ret)
+}
+
+/// Broadcasts the low packed 64-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastq_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+// FIXME: https://github.com/rust-lang/stdarch/issues/791
+#[cfg_attr(test, assert_instr(vmovddup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
+ let ret = simd_shuffle2!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
+ transmute::<i64x2, _>(ret)
+}
+
+/// Broadcasts the low packed 64-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastq_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
+ let ret = simd_shuffle4!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
+ transmute::<i64x4, _>(ret)
+}
+
+/// Broadcasts the low double-precision (64-bit) floating-point element
+/// from `a` to all elements of the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsd_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vmovddup))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
+ simd_shuffle2!(a, _mm_setzero_pd(), [0_u32; 2])
+}
+
+/// Broadcasts the low double-precision (64-bit) floating-point element
+/// from `a` to all elements of the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsd_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
+ simd_shuffle4!(a, _mm_setzero_pd(), [0_u32; 4])
+}
+
+// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
+// `vbroadcastf128`.
+/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsi128_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle4!(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
+ transmute::<i64x4, _>(ret)
+}
+
+/// Broadcasts the low single-precision (32-bit) floating-point element
+/// from `a` to all elements of the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastss_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
+ simd_shuffle4!(a, _mm_setzero_ps(), [0_u32; 4])
+}
+
+/// Broadcasts the low single-precision (32-bit) floating-point element
+/// from `a` to all elements of the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastss_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
+ simd_shuffle8!(a, _mm_setzero_ps(), [0_u32; 8])
+}
+
+/// Broadcasts the low packed 16-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastw_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle8!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
+ transmute::<i16x8, _>(ret)
+}
+
+/// Broadcasts the low packed 16-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastw_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpbroadcastw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
+ let zero = _mm_setzero_si128();
+ let ret = simd_shuffle16!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
+ transmute::<i16x16, _>(ret)
+}
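+
+// Sketch for the broadcast intrinsics above (AVX2 assumed, inside `unsafe`):
+// splat the lowest element of a 128-bit vector across a 256-bit vector.
+//
+//     let v = _mm_set_epi32(4, 3, 2, 1);  // lowest element is 1
+//     let r = _mm256_broadcastd_epi32(v); // all eight 32-bit lanes are 1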
+
+/// Compares packed 64-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpeqq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compares packed 32-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpeqd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compares packed 16-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpeqw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compares packed 8-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpeqb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Compares packed 64-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpgtq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Compares packed 32-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpgtd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compares packed 16-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpgtw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compares packed 8-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpcmpgtb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
+}
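+
+// Sketch: the comparisons above return a full lane mask (all ones for true,
+// zero for false), which pairs naturally with blends or `movemask`
+// (AVX2 assumed, inside `unsafe`):
+//
+//     let a = _mm256_set1_epi32(5);
+//     let b = _mm256_set1_epi32(3);
+//     let gt = _mm256_cmpgt_epi32(a, b);   // every lane is -1 (all ones)
+//     let bits = _mm256_movemask_epi8(gt); // all 32 mask bits set, i.e. -1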
+
+/// Sign-extend 16-bit integers to 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
+ transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
+}
+
+/// Sign-extend 16-bit integers to 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxwq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
+ let a = a.as_i16x8();
+ let v64: i16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ transmute::<i64x4, _>(simd_cast(v64))
+}
+
+/// Sign-extend 32-bit integers to 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxdq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
+ transmute::<i64x4, _>(simd_cast(a.as_i32x4()))
+}
+
+/// Sign-extend 8-bit integers to 16-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
+ transmute::<i16x16, _>(simd_cast(a.as_i8x16()))
+}
+
+/// Sign-extend 8-bit integers to 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxbd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
+ let a = a.as_i8x16();
+ let v64: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+ transmute::<i32x8, _>(simd_cast(v64))
+}
+
+/// Sign-extend 8-bit integers to 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovsxbq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
+ let a = a.as_i8x16();
+ let v32: i8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ transmute::<i64x4, _>(simd_cast(v32))
+}
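+
+// Sketch for the sign-extending conversions above (AVX2 assumed, inside
+// `unsafe`): negative values stay negative after widening.
+//
+//     let a = _mm_set1_epi8(-1);
+//     let r = _mm256_cvtepi8_epi16(a); // sixteen 16-bit lanes, each -1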
+
+/// Zero-extend unsigned 16-bit integers in `a` to 32-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
+ transmute::<i32x8, _>(simd_cast(a.as_u16x8()))
+}
+
+/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
+/// integers. The upper four elements of `a` are unused.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
+ let a = a.as_u16x8();
+ let v64: u16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ transmute::<i64x4, _>(simd_cast(v64))
+}
+
+/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
+ transmute::<i64x4, _>(simd_cast(a.as_u32x4()))
+}
+
+/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
+ transmute::<i16x16, _>(simd_cast(a.as_u8x16()))
+}
+
+/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
+/// integers. The upper eight elements of `a` are unused.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
+ let a = a.as_u8x16();
+ let v64: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+ transmute::<i32x8, _>(simd_cast(v64))
+}
+
+/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
+/// integers. The upper twelve elements of `a` are unused.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
+ let a = a.as_u8x16();
+ let v32: u8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
+ transmute::<i64x4, _>(simd_cast(v32))
+}
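+
+// Sketch for the zero-extending conversions above (AVX2 assumed, inside
+// `unsafe`): the same byte pattern widens without sign propagation.
+//
+//     let a = _mm_set1_epi8(-1);       // every byte is 0xff
+//     let r = _mm256_cvtepu8_epi16(a); // sixteen 16-bit lanes, each 255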
+
+/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extracti128_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vextractf128, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
+ static_assert_imm1!(IMM1);
+ let a = a.as_i64x4();
+ let b = _mm256_undefined_si256().as_i64x4();
+ let dst: i64x2 = simd_shuffle2!(a, b, <const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize]);
+ transmute(dst)
+}
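+
+// Sketch: `IMM1` selects which 128-bit half of `a` is extracted (AVX2
+// assumed, inside `unsafe`; `hi` and `lo` are hypothetical `__m128i` values):
+//
+//     let v = _mm256_set_m128i(hi, lo);
+//     let upper = _mm256_extracti128_si256::<1>(v); // equals `hi`
+//     let lower = _mm256_extracti128_si256::<0>(v); // equals `lo`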
+
+/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphaddw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(phaddw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphaddd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(phaddd(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
+/// using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadds_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphaddsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(phaddsw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphsubw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(phsubw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphsubd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(phsubd(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
+/// using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsubs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vphsubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(phsubsw(a.as_i16x16(), b.as_i16x16()))
+}
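+
+// Sketch of the horizontal-add pairing (AVX2 assumed, inside `unsafe`):
+// within each 128-bit lane, adjacent pairs of `a` are summed first, then
+// adjacent pairs of `b`.
+//
+//     let a = _mm256_set1_epi32(1);
+//     let b = _mm256_set1_epi32(10);
+//     // each 128-bit lane of `r` holds [2, 2, 20, 20]
+//     let r = _mm256_hadd_epi32(a, b);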
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
+ slice: *const i32,
+ offsets: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_si128().as_i32x4();
+ let neg_one = _mm_set1_epi32(-1).as_i32x4();
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
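+
+// Sketch: gather four `i32`s from a slice at the given indices, with a scale
+// of 4 because the elements are four bytes apart (AVX2 assumed, inside
+// `unsafe`):
+//
+//     let data: [i32; 8] = [0, 10, 20, 30, 40, 50, 60, 70];
+//     let idx = _mm_set_epi32(7, 5, 3, 1);
+//     // gathers data[1], data[3], data[5], data[7]
+//     let r = _mm_i32gather_epi32::<4>(data.as_ptr(), idx);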
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
+/// corresponding element in `mask` is not set, the value from `src` is used
+/// in that position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
+ src: __m128i,
+ slice: *const i32,
+ offsets: __m128i,
+ mask: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x4();
+ let mask = mask.as_i32x4();
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
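+
+// Sketch of the masked form, reusing `data` and `idx` from the sketch above:
+// lanes whose mask element has the highest bit clear keep the value from
+// `src` instead of gathering (AVX2 assumed, inside `unsafe`):
+//
+//     let src = _mm_set1_epi32(-1);
+//     let mask = _mm_set_epi32(-1, 0, -1, 0); // gather lanes 1 and 3 only
+//     let r = _mm_mask_i32gather_epi32::<4>(src, data.as_ptr(), idx, mask);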
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
+ slice: *const i32,
+ offsets: __m256i,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm256_setzero_si256().as_i32x8();
+ let neg_one = _mm256_set1_epi32(-1).as_i32x8();
+ let offsets = offsets.as_i32x8();
+ let slice = slice as *const i8;
+ let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
+/// corresponding element in `mask` is not set, the value from `src` is used
+/// in that position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
+ src: __m256i,
+ slice: *const i32,
+ offsets: __m256i,
+ mask: __m256i,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x8();
+ let mask = mask.as_i32x8();
+ let offsets = offsets.as_i32x8();
+ let slice = slice as *const i8;
+ let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_ps();
+ let neg_one = _mm_set1_ps(-1.0);
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
+/// corresponding element in `mask` is not set, the value from `src` is used
+/// in that position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
+ src: __m128,
+ slice: *const f32,
+ offsets: __m128i,
+ mask: __m128,
+) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ pgatherdps(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm256_setzero_ps();
+ let neg_one = _mm256_set1_ps(-1.0);
+ let offsets = offsets.as_i32x8();
+ let slice = slice as *const i8;
+ vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
+/// corresponding element in `mask` is not set, the value from `src` is used
+/// in that position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
+ src: __m256,
+ slice: *const f32,
+ offsets: __m256i,
+ mask: __m256,
+) -> __m256 {
+ static_assert_imm8_scale!(SCALE);
+ let offsets = offsets.as_i32x8();
+ let slice = slice as *const i8;
+ vpgatherdps(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
+ slice: *const i64,
+ offsets: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_si128().as_i64x2();
+ let neg_one = _mm_set1_epi64x(-1).as_i64x2();
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
+/// corresponding element in `mask` is not set, the value from `src` is used
+/// in that position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
+ src: __m128i,
+ slice: *const i64,
+ offsets: __m128i,
+ mask: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x2();
+ let mask = mask.as_i64x2();
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
+ slice: *const i64,
+ offsets: __m128i,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
+/// corresponding element in `mask` is not set, the value from `src` is used
+/// in that position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
+ src: __m256i,
+ slice: *const i64,
+ offsets: __m128i,
+ mask: __m256i,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x4();
+ let mask = mask.as_i64x4();
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_pd();
+ let neg_one = _mm_set1_pd(-1.0);
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
+/// corresponding element in `mask` is not set, the value from `src` is used
+/// in that position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
+ src: __m128d,
+ slice: *const f64,
+ offsets: __m128i,
+ mask: __m128d,
+) -> __m128d {
+ static_assert_imm8_scale!(SCALE);
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ pgatherdpd(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
+ slice: *const f64,
+ offsets: __m128i,
+) -> __m256d {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm256_setzero_pd();
+ let neg_one = _mm256_set1_pd(-1.0);
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
+/// corresponding element in `mask` is not set, the value from `src` is used
+/// in that position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
+ src: __m256d,
+ slice: *const f64,
+ offsets: __m128i,
+ mask: __m256d,
+) -> __m256d {
+ static_assert_imm8_scale!(SCALE);
+ let offsets = offsets.as_i32x4();
+ let slice = slice as *const i8;
+ vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
+ slice: *const i32,
+ offsets: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_si128().as_i32x4();
+ let neg_one = _mm_set1_epi64x(-1).as_i32x4();
+ let offsets = offsets.as_i64x2();
+ let slice = slice as *const i8;
+ let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
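+
+// Sketch for the `i64gather` variants: the indices are 64-bit, so a 128-bit
+// index vector addresses only two elements (AVX2 assumed, inside `unsafe`):
+//
+//     let data: [i32; 4] = [0, 10, 20, 30];
+//     let idx = _mm_set_epi64x(3, 1);
+//     // lanes 0 and 1 hold data[1] and data[3]; the upper lanes are zeroed
+//     let r = _mm_i64gather_epi32::<4>(data.as_ptr(), idx);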
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
+/// corresponding element in `mask` is not set, the value from `src` is used
+/// in that position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
+ src: __m128i,
+ slice: *const i32,
+ offsets: __m128i,
+ mask: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x4();
+ let mask = mask.as_i32x4();
+ let offsets = offsets.as_i64x2();
+ let slice = slice as *const i8;
+ let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
+ slice: *const i32,
+ offsets: __m256i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_si128().as_i32x4();
+ let neg_one = _mm_set1_epi64x(-1).as_i32x4();
+ let offsets = offsets.as_i64x4();
+ let slice = slice as *const i8;
+ let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
+/// corresponding element in `mask` is not set, the value from `src` is used
+/// in that position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
+ src: __m128i,
+ slice: *const i32,
+ offsets: __m256i,
+ mask: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i32x4();
+ let mask = mask.as_i32x4();
+ let offsets = offsets.as_i64x4();
+ let slice = slice as *const i8;
+ let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_ps();
+ let neg_one = _mm_set1_ps(-1.0);
+ let offsets = offsets.as_i64x2();
+ let slice = slice as *const i8;
+ pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
+/// corresponding element in `mask` is not set, the value from `src` is used
+/// in that position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
+ src: __m128,
+ slice: *const f32,
+ offsets: __m128i,
+ mask: __m128,
+) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ let offsets = offsets.as_i64x2();
+ let slice = slice as *const i8;
+ pgatherqps(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_ps();
+ let neg_one = _mm_set1_ps(-1.0);
+ let offsets = offsets.as_i64x4();
+ let slice = slice as *const i8;
+ vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the
+/// corresponding element in `mask` is not set, the value from `src` is used
+/// in that position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
+ src: __m128,
+ slice: *const f32,
+ offsets: __m256i,
+ mask: __m128,
+) -> __m128 {
+ static_assert_imm8_scale!(SCALE);
+ let offsets = offsets.as_i64x4();
+ let slice = slice as *const i8;
+ vpgatherqps(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
+ slice: *const i64,
+ offsets: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_si128().as_i64x2();
+ let neg_one = _mm_set1_epi64x(-1).as_i64x2();
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x2();
+ let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
+/// corresponding element of `mask`, the value from `src` is copied to that
+/// position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
+ src: __m128i,
+ slice: *const i64,
+ offsets: __m128i,
+ mask: __m128i,
+) -> __m128i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x2();
+ let mask = mask.as_i64x2();
+ let offsets = offsets.as_i64x2();
+ let slice = slice as *const i8;
+ let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
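+///
+/// A small usage sketch (an illustrative addition, not Intel's text), assuming
+/// AVX2 has been detected at runtime:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let data: [i64; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
+/// let offsets = _mm256_setr_epi64x(0, 2, 4, 6);
+/// // With SCALE = 8 each offset is scaled by the size of an `i64`.
+/// let r = _mm256_i64gather_epi64::<8>(data.as_ptr(), offsets);
+/// let expected = _mm256_setr_epi64x(10, 12, 14, 16);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```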
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
+ slice: *const i64,
+ offsets: __m256i,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x4();
+ let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
+/// corresponding element of `mask`, the value from `src` is copied to that
+/// position instead.
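+///
+/// A masked-gather sketch (an illustrative addition), assuming AVX2 is
+/// available at runtime:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let data: [i64; 4] = [100, 101, 102, 103];
+/// let src = _mm256_set1_epi64x(-1);
+/// let offsets = _mm256_setr_epi64x(0, 1, 2, 3);
+/// // Lanes 0 and 2 have the mask bit set and are gathered;
+/// // lanes 1 and 3 keep the corresponding value from `src`.
+/// let mask = _mm256_setr_epi64x(-1, 0, -1, 0);
+/// let r = _mm256_mask_i64gather_epi64::<8>(src, data.as_ptr(), offsets, mask);
+/// let expected = _mm256_setr_epi64x(100, -1, 102, -1);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```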
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
+ src: __m256i,
+ slice: *const i64,
+ offsets: __m256i,
+ mask: __m256i,
+) -> __m256i {
+ static_assert_imm8_scale!(SCALE);
+ let src = src.as_i64x4();
+ let mask = mask.as_i64x4();
+ let offsets = offsets.as_i64x4();
+ let slice = slice as *const i8;
+ let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
+ transmute(r)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm_setzero_pd();
+ let neg_one = _mm_set1_pd(-1.0);
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x2();
+ pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
+/// corresponding element of `mask`, the value from `src` is copied to that
+/// position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
+ src: __m128d,
+ slice: *const f64,
+ offsets: __m128i,
+ mask: __m128d,
+) -> __m128d {
+ static_assert_imm8_scale!(SCALE);
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x2();
+ pgatherqpd(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
+ slice: *const f64,
+ offsets: __m256i,
+) -> __m256d {
+ static_assert_imm8_scale!(SCALE);
+ let zero = _mm256_setzero_pd();
+ let neg_one = _mm256_set1_pd(-1.0);
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x4();
+ vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
+}
+
+/// Returns values from `slice` at offsets determined by `offsets * scale`,
+/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
+/// corresponding element of `mask`, the value from `src` is copied to that
+/// position instead.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
+#[rustc_legacy_const_generics(4)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
+ src: __m256d,
+ slice: *const f64,
+ offsets: __m256i,
+ mask: __m256d,
+) -> __m256d {
+ static_assert_imm8_scale!(SCALE);
+ let slice = slice as *const i8;
+ let offsets = offsets.as_i64x4();
+ vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
+}
+
+/// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at the
+/// location specified by `IMM1`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_inserti128_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(
+ all(test, not(target_os = "windows")),
+ assert_instr(vinsertf128, IMM1 = 1)
+)]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
+ static_assert_imm1!(IMM1);
+ let a = a.as_i64x4();
+ let b = _mm256_castsi128_si256(b).as_i64x4();
+ let dst: i64x4 =
+ simd_shuffle4!(a, b, <const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
+ transmute(dst)
+}
+
+/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
+/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
+/// of intermediate 32-bit integers.
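+///
+/// A short example (added for illustration), assuming AVX2 is available:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_set1_epi16(2);
+/// let b = _mm256_set1_epi16(3);
+/// // Each 32-bit lane is 2 * 3 + 2 * 3 = 12.
+/// let r = _mm256_madd_epi16(a, b);
+/// let expected = _mm256_set1_epi32(12);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```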
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaddwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaddwd(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Vertically multiplies each unsigned 8-bit integer from `a` with the
+/// corresponding signed 8-bit integer from `b`, producing intermediate
+/// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
+/// signed 16-bit integers.
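+///
+/// An illustrative example (not Intel's text), assuming AVX2 is available:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_set1_epi8(10); // interpreted as unsigned
+/// let b = _mm256_set1_epi8(-2); // interpreted as signed
+/// // Each 16-bit lane is 10 * -2 + 10 * -2 = -40.
+/// let r = _mm256_maddubs_epi16(a, b);
+/// let expected = _mm256_set1_epi16(-40);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```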
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maddubs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaddubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
+/// (elements are zeroed out when the highest bit is not set in the
+/// corresponding element).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
+ transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
+}
+
+/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
+/// (elements are zeroed out when the highest bit is not set in the
+/// corresponding element).
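+///
+/// A small usage sketch (added for illustration), assuming AVX2 is available:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let data: [i32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
+/// // Only lanes whose mask element has its highest bit set are loaded.
+/// let mask = _mm256_setr_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
+/// let r = _mm256_maskload_epi32(data.as_ptr(), mask);
+/// let expected = _mm256_setr_epi32(1, 0, 3, 0, 5, 0, 7, 0);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```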
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
+ transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
+}
+
+/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
+/// (elements are zeroed out when the highest bit is not set in the
+/// corresponding element).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
+ transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
+}
+
+/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
+/// (elements are zeroed out when the highest bit is not set in the
+/// corresponding element).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
+ transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
+}
+
+/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
+/// using `mask` (elements are not stored when the highest bit is not set
+/// in the corresponding element).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
+ maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
+}
+
+/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
+/// using `mask` (elements are not stored when the highest bit is not set
+/// in the corresponding element).
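+///
+/// An illustrative sketch (not Intel's text), assuming AVX2 is available:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+/// let mut out = [0i32; 8];
+/// // Only the first four lanes have the mask bit set and are stored.
+/// let mask = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
+/// _mm256_maskstore_epi32(out.as_mut_ptr(), mask, a);
+/// assert_eq!(out, [1, 2, 3, 4, 0, 0, 0, 0]);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```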
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
+ maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
+}
+
+/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
+/// using `mask` (elements are not stored when the highest bit is not set
+/// in the corresponding element).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
+ maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
+}
+
+/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
+/// using `mask` (elements are not stored when the highest bit is not set
+/// in the corresponding element).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaskmovq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
+ maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
+}
+
+/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
+/// maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaxsw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
+/// maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaxsd(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
+/// maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaxsb(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
+/// the packed maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaxuw(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
+/// the packed maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaxud(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
+/// the packed maximum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmaxub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmaxub(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
+/// minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pminsw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
+/// minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pminsd(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
+/// minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pminsb(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
+/// the packed minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pminuw(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
+/// the packed minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminud))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pminud(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
+/// the packed minimum values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpminub))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pminub(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Creates a mask from the most significant bit of each 8-bit element in `a`,
+/// returning the result.
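+///
+/// A short example (added for illustration), assuming AVX2 is available:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // All bytes have their most significant bit set, so every mask bit is 1.
+/// assert_eq!(_mm256_movemask_epi8(_mm256_set1_epi8(-1)), !0);
+/// // No byte has its most significant bit set, so the mask is 0.
+/// assert_eq!(_mm256_movemask_epi8(_mm256_setzero_si256()), 0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```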
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmovmskb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
+ pmovmskb(a.as_i8x32())
+}
+
+/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
+/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
+/// results in dst. Eight SADs are performed for each 128-bit lane using one
+/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
+/// selected from `b` starting at the offset specified in `imm8`. Eight
+/// quadruplets are formed from sequential 8-bit integers selected from `a`
+/// starting at the offset specified in `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mpsadbw_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8))
+}
+
+/// Multiplies the low 32-bit integers from each packed 64-bit element in
+/// `a` and `b`
+///
+/// Returns the 64-bit results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmuldq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmuldq(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
+/// element in `a` and `b`
+///
+/// Returns the unsigned 64-bit results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epu32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmuludq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmuludq(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Multiplies the packed 16-bit integers in `a` and `b`, producing
+/// intermediate 32-bit integers and returning the high 16 bits of the
+/// intermediate integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulhw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmulhw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
+/// intermediate 32-bit integers and returning the high 16 bits of the
+/// intermediate integers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulhuw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmulhuw(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Multiplies the packed 16-bit integers in `a` and `b`, producing
+/// intermediate 32-bit integers, and returns the low 16 bits of the
+/// intermediate integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmullw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_mul(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Multiplies the packed 32-bit integers in `a` and `b`, producing
+/// intermediate 64-bit integers, and returns the low 32 bits of the
+/// intermediate integers
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_mul(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiplies packed 16-bit integers in `a` and `b`, producing
+/// intermediate signed 32-bit integers. Truncates each intermediate
+/// integer to the 18 most significant bits, rounds by adding 1, and
+/// returns bits `[16:1]`.
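+///
+/// A fixed-point sketch (added for illustration), assuming AVX2 is available:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // In Q15 fixed point, 0.5 * 0.25 rounds to 0.125.
+/// let a = _mm256_set1_epi16(16384); // 0.5
+/// let b = _mm256_set1_epi16(8192); // 0.25
+/// let r = _mm256_mulhrs_epi16(a, b);
+/// let expected = _mm256_set1_epi16(4096); // 0.125
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```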
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhrs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpmulhrsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
+/// and `b`
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_or(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using signed saturation
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpacksswb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(packsswb(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using signed saturation
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpackssdw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(packssdw(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using unsigned saturation
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpackuswb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(packuswb(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using unsigned saturation
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpackusdw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(packusdw(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Permutes packed 32-bit integers from `a` according to the content of `b`.
+///
+/// The last 3 bits of each integer of `b` are used as addresses into the 8
+/// integers of `a`.
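+///
+/// A small usage sketch (an illustrative addition), assuming AVX2 is available:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
+/// // Each output lane i receives a[b[i] & 7]; lanes may cross the
+/// // 128-bit halves, so this reverses the whole vector.
+/// let b = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+/// let r = _mm256_permutevar8x32_epi32(a, b);
+/// let expected = _mm256_setr_epi32(80, 70, 60, 50, 40, 30, 20, 10);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```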
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(permd(a.as_u32x8(), b.as_u32x8()))
+}
+
+/// Permutes 64-bit integers from `a` using control mask `imm8`.
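+///
+/// An illustrative example, assuming AVX2 is available at runtime:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi64x(10, 20, 30, 40);
+/// // Selector 0b00_01_10_11 picks source lanes 3, 2, 1, 0, reversing the vector.
+/// let r = _mm256_permute4x64_epi64::<0b00_01_10_11>(a);
+/// let expected = _mm256_setr_epi64x(40, 30, 20, 10);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```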
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let zero = _mm256_setzero_si256().as_i64x4();
+ let r: i64x4 = simd_shuffle4!(
+ a.as_i64x4(),
+ zero,
+ <const IMM8: i32> [
+ IMM8 as u32 & 0b11,
+ (IMM8 as u32 >> 2) & 0b11,
+ (IMM8 as u32 >> 4) & 0b11,
+ (IMM8 as u32 >> 6) & 0b11,
+ ],
+ );
+ transmute(r)
+}
+
+/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2x128_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8))
+}
+
+/// Shuffles 64-bit floating-point elements in `a` across lanes using the
+/// control in `imm8`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_pd)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
+ static_assert_imm8!(IMM8);
+ simd_shuffle4!(
+ a,
+ _mm256_undefined_pd(),
+ <const IMM8: i32> [
+ IMM8 as u32 & 0b11,
+ (IMM8 as u32 >> 2) & 0b11,
+ (IMM8 as u32 >> 4) & 0b11,
+ (IMM8 as u32 >> 6) & 0b11,
+ ],
+ )
+}
+
+/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
+/// the corresponding 32-bit integer index in `idx`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_ps)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
+ permps(a, idx.as_i32x8())
+}
+
+/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
+/// and `b`, then horizontally sums each consecutive 8 differences to
+/// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
+/// integers into the low 16 bits of the 64-bit elements of the return value.
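+///
+/// A short example (added for illustration), assuming AVX2 is available:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_set1_epi8(3);
+/// let b = _mm256_set1_epi8(5);
+/// // Each group of eight byte differences sums to |3 - 5| * 8 = 16.
+/// let r = _mm256_sad_epu8(a, b);
+/// let expected = _mm256_set1_epi64x(16);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```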
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sad_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsadbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(psadbw(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Shuffles bytes from `a` according to the content of `b`.
+///
+/// For each of the 128-bit low and high halves of the vectors, the last
+/// 4 bits of each byte of `b` are used as addresses into the respective
+/// low or high 16 bytes of `a`. That is, the halves are shuffled separately.
+///
+/// In addition, if the most significant bit of a byte of `b` is set, the
+/// respective destination byte is set to 0.
+///
+/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
+/// equivalent to:
+///
+/// ```
+/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
+/// let mut r = [0; 32];
+/// for i in 0..16 {
+/// // if the most significant bit of b is set,
+/// // then the destination byte is set to 0.
+/// if b[i] & 0x80 == 0u8 {
+/// r[i] = a[(b[i] % 16) as usize];
+/// }
+/// if b[i + 16] & 0x80 == 0u8 {
+/// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
+/// }
+/// }
+/// r
+/// }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpshufb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(pshufb(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
+/// `imm8`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+///
+/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
+/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
+///
+/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
+/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
+///
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpermilps, MASK = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(MASK);
+ let r: i32x8 = simd_shuffle8!(
+ a.as_i32x8(),
+ a.as_i32x8(),
+ <const MASK: i32> [
+ MASK as u32 & 0b11,
+ (MASK as u32 >> 2) & 0b11,
+ (MASK as u32 >> 4) & 0b11,
+ (MASK as u32 >> 6) & 0b11,
+ (MASK as u32 & 0b11) + 4,
+ ((MASK as u32 >> 2) & 0b11) + 4,
+ ((MASK as u32 >> 4) & 0b11) + 4,
+ ((MASK as u32 >> 6) & 0b11) + 4,
+ ],
+ );
+ transmute(r)
+}
+
+/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflehi_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i16x16();
+ let r: i16x16 = simd_shuffle16!(
+ a,
+ a,
+ <const IMM8: i32> [
+ 0,
+ 1,
+ 2,
+ 3,
+ 4 + (IMM8 as u32 & 0b11),
+ 4 + ((IMM8 as u32 >> 2) & 0b11),
+ 4 + ((IMM8 as u32 >> 4) & 0b11),
+ 4 + ((IMM8 as u32 >> 6) & 0b11),
+ 8,
+ 9,
+ 10,
+ 11,
+ 12 + (IMM8 as u32 & 0b11),
+ 12 + ((IMM8 as u32 >> 2) & 0b11),
+ 12 + ((IMM8 as u32 >> 4) & 0b11),
+ 12 + ((IMM8 as u32 >> 6) & 0b11),
+ ],
+ );
+ transmute(r)
+}
+
+/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflelo_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i16x16();
+ let r: i16x16 = simd_shuffle16!(
+ a,
+ a,
+ <const IMM8: i32> [
+ 0 + (IMM8 as u32 & 0b11),
+ 0 + ((IMM8 as u32 >> 2) & 0b11),
+ 0 + ((IMM8 as u32 >> 4) & 0b11),
+ 0 + ((IMM8 as u32 >> 6) & 0b11),
+ 4,
+ 5,
+ 6,
+ 7,
+ 8 + (IMM8 as u32 & 0b11),
+ 8 + ((IMM8 as u32 >> 2) & 0b11),
+ 8 + ((IMM8 as u32 >> 4) & 0b11),
+ 8 + ((IMM8 as u32 >> 6) & 0b11),
+ 12,
+ 13,
+ 14,
+ 15,
+ ],
+ );
+ transmute(r)
+}
+
+/// Negates packed 16-bit integers in `a` when the corresponding signed
+/// 16-bit integer in `b` is negative, and returns the results.
+/// Results are zeroed out when the corresponding element in `b` is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsignw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(psignw(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Negates packed 32-bit integers in `a` when the corresponding signed
+/// 32-bit integer in `b` is negative, and returns the results.
+/// Results are zeroed out when the corresponding element in `b` is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsignd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(psignd(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Negates packed 8-bit integers in `a` when the corresponding signed
+/// 8-bit integer in `b` is negative, and returns the results.
+/// Results are zeroed out when the corresponding element in `b` is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsignb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(psignb(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Shifts packed 16-bit integers in `a` left by `count` while
+/// shifting in zeros, and returns the result
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psllw(a.as_i16x16(), count.as_i16x8()))
+}
+
+/// Shifts packed 32-bit integers in `a` left by `count` while
+/// shifting in zeros, and returns the result
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpslld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
+ transmute(pslld(a.as_i32x8(), count.as_i32x4()))
+}
+
+/// Shifts packed 64-bit integers in `a` left by `count` while
+/// shifting in zeros, and returns the result
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psllq(a.as_i64x4(), count.as_i64x2()))
+}
+
+/// Shifts packed 16-bit integers in `a` left by `IMM8` while
+/// shifting in zeros, and returns the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(pslliw(a.as_i16x16(), IMM8))
+}
+
+/// Shifts packed 32-bit integers in `a` left by `IMM8` while
+/// shifting in zeros, and returns the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(psllid(a.as_i32x8(), IMM8))
+}
+
+/// Shifts packed 64-bit integers in `a` left by `IMM8` while
+/// shifting in zeros, and returns the results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(pslliq(a.as_i64x4(), IMM8))
+}
+
+/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ _mm256_bslli_epi128::<IMM8>(a)
+}
+
+/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ const fn mask(shift: i32, i: u32) -> u32 {
+ let shift = shift as u32 & 0xff;
+ if shift > 15 || i % 16 < shift {
+ 0
+ } else {
+ 32 + (i - shift)
+ }
+ }
+ let a = a.as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ let r: i8x32 = simd_shuffle32!(
+ zero,
+ a,
+ <const IMM8: i32> [
+ mask(IMM8, 0),
+ mask(IMM8, 1),
+ mask(IMM8, 2),
+ mask(IMM8, 3),
+ mask(IMM8, 4),
+ mask(IMM8, 5),
+ mask(IMM8, 6),
+ mask(IMM8, 7),
+ mask(IMM8, 8),
+ mask(IMM8, 9),
+ mask(IMM8, 10),
+ mask(IMM8, 11),
+ mask(IMM8, 12),
+ mask(IMM8, 13),
+ mask(IMM8, 14),
+ mask(IMM8, 15),
+ mask(IMM8, 16),
+ mask(IMM8, 17),
+ mask(IMM8, 18),
+ mask(IMM8, 19),
+ mask(IMM8, 20),
+ mask(IMM8, 21),
+ mask(IMM8, 22),
+ mask(IMM8, 23),
+ mask(IMM8, 24),
+ mask(IMM8, 25),
+ mask(IMM8, 26),
+ mask(IMM8, 27),
+ mask(IMM8, 28),
+ mask(IMM8, 29),
+ mask(IMM8, 30),
+ mask(IMM8, 31),
+ ],
+ );
+ transmute(r)
+}
+
+/// Shifts packed 32-bit integers in `a` left by the amount
+/// specified by the corresponding element in `count` while
+/// shifting in zeros, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psllvd(a.as_i32x4(), count.as_i32x4()))
+}
+
+/// Shifts packed 32-bit integers in `a` left by the amount
+/// specified by the corresponding element in `count` while
+/// shifting in zeros, and returns the result.
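+///
+/// A small usage sketch (an illustrative addition), assuming AVX2 is available:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_set1_epi32(1);
+/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+/// // Each lane of `a` is shifted left by its own count.
+/// let r = _mm256_sllv_epi32(a, count);
+/// let expected = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```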
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
+ transmute(psllvd256(a.as_i32x8(), count.as_i32x8()))
+}
+
+/// Shifts packed 64-bit integers in `a` left by the amount
+/// specified by the corresponding element in `count` while
+/// shifting in zeros, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psllvq(a.as_i64x2(), count.as_i64x2()))
+}
+
+/// Shifts packed 64-bit integers in `a` left by the amount
+/// specified by the corresponding element in `count` while
+/// shifting in zeros, and returns the result.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsllvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
+ transmute(psllvq256(a.as_i64x4(), count.as_i64x4()))
+}
+
+/// Shifts packed 16-bit integers in `a` right by `count` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsraw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psraw(a.as_i16x16(), count.as_i16x8()))
+}
+
+/// Shifts packed 32-bit integers in `a` right by `count` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrad))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psrad(a.as_i32x8(), count.as_i32x4()))
+}
+
+/// Shifts packed 16-bit integers in `a` right by `IMM8` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(psraiw(a.as_i16x16(), IMM8))
+}
+
+/// Shifts packed 32-bit integers in `a` right by `IMM8` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(psraid(a.as_i32x8(), IMM8))
+}
+
+/// Shifts packed 32-bit integers in `a` right by the amount specified by the
+/// corresponding element in `count` while shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srav_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psravd(a.as_i32x4(), count.as_i32x4()))
+}
+
+/// Shifts packed 32-bit integers in `a` right by the amount specified by the
+/// corresponding element in `count` while shifting in sign bits.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srav_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsravd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
+ transmute(psravd256(a.as_i32x8(), count.as_i32x8()))
+}
+
+/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
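+///
+/// An illustrative example (not Intel's text), assuming AVX2 is available:
+///
+/// ```
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+/// // Shifting each 128-bit lane right by 4 bytes drops the lowest 32-bit
+/// // element of the lane and shifts in a zero at the top.
+/// let r = _mm256_srli_si256::<4>(a);
+/// let expected = _mm256_setr_epi32(2, 3, 4, 0, 6, 7, 8, 0);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```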
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ _mm256_bsrli_epi128::<IMM8>(a)
+}
+
+/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ let a = a.as_i8x32();
+ let zero = _mm256_setzero_si256().as_i8x32();
+ let r: i8x32 = match IMM8 {
+ 0 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ ],
+ ),
+ 1 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ ],
+ ),
+ 2 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32, 32,
+ ],
+ ),
+ 3 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32, 32, 32,
+ ],
+ ),
+ 4 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 32, 32, 32,
+ ],
+ ),
+ 5 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 6 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 22, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 7 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 8 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 24, 25, 26, 27, 28,
+ 29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 9 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 25, 26, 27, 28, 29,
+ 30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 10 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 26, 27, 28, 29, 30,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 11 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 27, 28, 29, 30, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 12 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 28, 29, 30, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 13 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 29, 30, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 14 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 30, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ 15 => simd_shuffle32!(
+ a,
+ zero,
+ [
+ 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ ],
+ ),
+ _ => zero,
+ };
+ transmute(r)
+}
+
+/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psrlw(a.as_i16x16(), count.as_i16x8()))
+}
+
+/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrld))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psrld(a.as_i32x8(), count.as_i32x4()))
+}
+
+/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
+ transmute(psrlq(a.as_i64x4(), count.as_i64x2()))
+}
+
+/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(psrliw(a.as_i16x16(), IMM8))
+}
+
+/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(psrlid(a.as_i32x8(), IMM8))
+}
+
+/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
+ static_assert_imm8!(IMM8);
+ transmute(psrliq(a.as_i64x4(), IMM8))
+}
+
+/// Shifts packed 32-bit integers in `a` right by the amount specified by
+/// the corresponding element in `count` while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psrlvd(a.as_i32x4(), count.as_i32x4()))
+}
+
+/// Shifts packed 32-bit integers in `a` right by the amount specified by
+/// the corresponding element in `count` while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlvd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
+ transmute(psrlvd256(a.as_i32x8(), count.as_i32x8()))
+}
+
+/// Shifts packed 64-bit integers in `a` right by the amount specified by
+/// the corresponding element in `count` while shifting in zeros.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
+ transmute(psrlvq(a.as_i64x2(), count.as_i64x2()))
+}
+
+/// Shifts packed 64-bit integers in `a` right by the amount specified by
+/// the corresponding element in `count` while shifting in zeros.
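+///
+/// A small usage sketch with illustrative values, in the same shape as the
+/// other doc examples in this module:
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // Each 64-bit lane uses its own shift count.
+/// let a = _mm256_set1_epi64x(64);
+/// let count = _mm256_setr_epi64x(0, 1, 2, 3);
+/// let r = _mm256_srlv_epi64(a, count);
+/// let expected = _mm256_setr_epi64x(64, 32, 16, 8);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```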
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsrlvq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
+ transmute(psrlvq256(a.as_i64x4(), count.as_i64x4()))
+}
+
+// TODO _mm256_stream_load_si256 (__m256i const* mem_addr)
+
+/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_sub(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_sub(a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubq))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_sub(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_sub(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
+/// `a` using saturation.
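+///
+/// A small usage sketch showing the saturating behaviour (illustrative
+/// values, in the same shape as the other doc examples in this module):
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // Signed saturation: i16::MIN - 1 stays at i16::MIN instead of wrapping.
+/// let a = _mm256_set1_epi16(i16::MIN);
+/// let b = _mm256_set1_epi16(1);
+/// let r = _mm256_subs_epi16(a, b);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, a)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```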
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16()))
+}
+
+/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
+/// `a` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32()))
+}
+
+/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
+/// 16-bit integers in `a` using saturation.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubusw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16()))
+}
+
+/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
+/// 8-bit integers in `a` using saturation.
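+///
+/// A small usage sketch showing the saturating behaviour (illustrative
+/// values, in the same shape as the other doc examples in this module):
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // Unsigned saturation clamps at zero: 1 - 2 == 0 rather than wrapping to 255.
+/// let a = _mm256_set1_epi8(1);
+/// let b = _mm256_set1_epi8(2);
+/// let r = _mm256_subs_epu8(a, b);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_set1_epi8(0))), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```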
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpsubusb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32()))
+}
+
+/// Unpacks and interleaves 8-bit integers from the high half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi8(
+/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+/// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+/// );
+/// let b = _mm256_setr_epi8(
+/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
+/// -30, -31,
+/// );
+///
+/// let c = _mm256_unpackhi_epi8(a, b);
+///
+/// let expected = _mm256_setr_epi8(
+/// 8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
+/// 24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
+/// -31,
+/// );
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpunpckhbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
+ #[rustfmt::skip]
+ let r: i8x32 = simd_shuffle32!(a.as_i8x32(), b.as_i8x32(), [
+ 8, 40, 9, 41, 10, 42, 11, 43,
+ 12, 44, 13, 45, 14, 46, 15, 47,
+ 24, 56, 25, 57, 26, 58, 27, 59,
+ 28, 60, 29, 61, 30, 62, 31, 63,
+ ]);
+ transmute(r)
+}
+
+/// Unpacks and interleaves 8-bit integers from the low half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi8(
+/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+/// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+/// );
+/// let b = _mm256_setr_epi8(
+/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
+/// -30, -31,
+/// );
+///
+/// let c = _mm256_unpacklo_epi8(a, b);
+///
+/// let expected = _mm256_setr_epi8(
+/// 0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
+/// -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
+/// );
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpunpcklbw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
+ #[rustfmt::skip]
+ let r: i8x32 = simd_shuffle32!(a.as_i8x32(), b.as_i8x32(), [
+ 0, 32, 1, 33, 2, 34, 3, 35,
+ 4, 36, 5, 37, 6, 38, 7, 39,
+ 16, 48, 17, 49, 18, 50, 19, 51,
+ 20, 52, 21, 53, 22, 54, 23, 55,
+ ]);
+ transmute(r)
+}
+
+/// Unpacks and interleaves 16-bit integers from the high half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi16(
+/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+/// );
+/// let b = _mm256_setr_epi16(
+/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// );
+///
+/// let c = _mm256_unpackhi_epi16(a, b);
+///
+/// let expected = _mm256_setr_epi16(
+/// 4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
+/// );
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpunpckhwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
+ let r: i16x16 = simd_shuffle16!(
+ a.as_i16x16(),
+ b.as_i16x16(),
+ [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
+ );
+ transmute(r)
+}
+
+/// Unpacks and interleaves 16-bit integers from the low half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+///
+/// let a = _mm256_setr_epi16(
+/// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+/// );
+/// let b = _mm256_setr_epi16(
+/// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
+/// );
+///
+/// let c = _mm256_unpacklo_epi16(a, b);
+///
+/// let expected = _mm256_setr_epi16(
+/// 0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
+/// );
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vpunpcklwd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
+ let r: i16x16 = simd_shuffle16!(
+ a.as_i16x16(),
+ b.as_i16x16(),
+ [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
+ );
+ transmute(r)
+}
+
+/// Unpacks and interleaves 32-bit integers from the high half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
+///
+/// let c = _mm256_unpackhi_epi32(a, b);
+///
+/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
+ let r: i32x8 = simd_shuffle8!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
+ transmute(r)
+}
+
+/// Unpacks and interleaves 32-bit integers from the low half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
+///
+/// let c = _mm256_unpacklo_epi32(a, b);
+///
+/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
+ let r: i32x8 = simd_shuffle8!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
+ transmute(r)
+}
+
+/// Unpacks and interleaves 64-bit integers from the high half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
+/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
+///
+/// let c = _mm256_unpackhi_epi64(a, b);
+///
+/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
+ let r: i64x4 = simd_shuffle4!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
+ transmute(r)
+}
+
+/// Unpacks and interleaves 64-bit integers from the low half of each
+/// 128-bit lane of `a` and `b`.
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
+/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
+///
+/// let c = _mm256_unpacklo_epi64(a, b);
+///
+/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
+///
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi64)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
+ let r: i64x4 = simd_shuffle4!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
+ transmute(r)
+}
+
+/// Computes the bitwise XOR of 256 bits (representing integer data)
+/// in `a` and `b`.
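+///
+/// A small usage sketch with illustrative values, in the same shape as the
+/// other doc examples in this module:
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // 0b0101_0101 ^ 0b0011_0011 == 0b0110_0110 in every byte.
+/// let a = _mm256_set1_epi8(0b0101_0101);
+/// let b = _mm256_set1_epi8(0b0011_0011);
+/// let r = _mm256_xor_si256(a, b);
+/// let expected = _mm256_set1_epi8(0b0110_0110);
+/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```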
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_si256)
+#[inline]
+#[target_feature(enable = "avx2")]
+#[cfg_attr(test, assert_instr(vxorps))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
+ transmute(simd_xor(a.as_i64x4(), b.as_i64x4()))
+}
+
+/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
+/// integer containing the zero-extended integer data.
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi8)
+#[inline]
+#[target_feature(enable = "avx2")]
+// This intrinsic has no corresponding instruction.
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
+ static_assert_imm5!(INDEX);
+ simd_extract::<_, u8>(a.as_u8x32(), INDEX as u32) as i32
+}
+
+/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
+/// integer containing the zero-extended integer data.
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi16)
+#[inline]
+#[target_feature(enable = "avx2")]
+// This intrinsic has no corresponding instruction.
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
+ static_assert_imm4!(INDEX);
+ simd_extract::<_, u16>(a.as_u16x16(), INDEX as u32) as i32
+}
+
+/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
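+///
+/// A small usage sketch with illustrative values, in the same shape as the
+/// other doc examples in this module:
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // Pull out the element at index 5.
+/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+/// assert_eq!(_mm256_extract_epi32::<5>(a), 5);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```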
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi32)
+#[inline]
+#[target_feature(enable = "avx2")]
+// This intrinsic has no corresponding instruction.
+#[rustc_legacy_const_generics(1)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
+ static_assert_imm3!(INDEX);
+ simd_extract(a.as_i32x8(), INDEX as u32)
+}
+
+/// Returns the first element of the input vector of `[4 x double]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsd_f64)
+#[inline]
+#[target_feature(enable = "avx2")]
+//#[cfg_attr(test, assert_instr(movsd))] FIXME
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
+ simd_extract(a, 0)
+}
+
+/// Returns the first element of the input vector of `[8 x i32]`.
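+///
+/// A small usage sketch with illustrative values, in the same shape as the
+/// other doc examples in this module:
+///
+/// ```rust
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::*;
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::*;
+///
+/// # fn main() {
+/// # if is_x86_feature_detected!("avx2") {
+/// # #[target_feature(enable = "avx2")]
+/// # unsafe fn worker() {
+/// // Only the lowest 32-bit element is returned.
+/// let a = _mm256_setr_epi32(7, 1, 2, 3, 4, 5, 6, 0);
+/// assert_eq!(_mm256_cvtsi256_si32(a), 7);
+/// # }
+/// # unsafe { worker(); }
+/// # }
+/// # }
+/// ```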
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsi256_si32)
+#[inline]
+#[target_feature(enable = "avx2")]
+//#[cfg_attr(test, assert_instr(movd))] FIXME
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
+ simd_extract(a.as_i32x8(), 0)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.x86.avx2.pabs.b"]
+ fn pabsb(a: i8x32) -> u8x32;
+ #[link_name = "llvm.x86.avx2.pabs.w"]
+ fn pabsw(a: i16x16) -> u16x16;
+ #[link_name = "llvm.x86.avx2.pabs.d"]
+ fn pabsd(a: i32x8) -> u32x8;
+ #[link_name = "llvm.x86.avx2.pavg.b"]
+ fn pavgb(a: u8x32, b: u8x32) -> u8x32;
+ #[link_name = "llvm.x86.avx2.pavg.w"]
+ fn pavgw(a: u16x16, b: u16x16) -> u16x16;
+ #[link_name = "llvm.x86.avx2.pblendvb"]
+ fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32;
+ #[link_name = "llvm.x86.avx2.phadd.w"]
+ fn phaddw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.phadd.d"]
+ fn phaddd(a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.phadd.sw"]
+ fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.phsub.w"]
+ fn phsubw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.phsub.d"]
+ fn phsubd(a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.phsub.sw"]
+ fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.pmadd.wd"]
+ fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
+ #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
+ fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
+ #[link_name = "llvm.x86.avx2.maskload.d"]
+ fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.maskload.d.256"]
+ fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.maskload.q"]
+ fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
+ #[link_name = "llvm.x86.avx2.maskload.q.256"]
+ fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
+ #[link_name = "llvm.x86.avx2.maskstore.d"]
+ fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
+ #[link_name = "llvm.x86.avx2.maskstore.d.256"]
+ fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
+ #[link_name = "llvm.x86.avx2.maskstore.q"]
+ fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
+ #[link_name = "llvm.x86.avx2.maskstore.q.256"]
+ fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
+ #[link_name = "llvm.x86.avx2.pmaxs.w"]
+ fn pmaxsw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.pmaxs.d"]
+ fn pmaxsd(a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.pmaxs.b"]
+ fn pmaxsb(a: i8x32, b: i8x32) -> i8x32;
+ #[link_name = "llvm.x86.avx2.pmaxu.w"]
+ fn pmaxuw(a: u16x16, b: u16x16) -> u16x16;
+ #[link_name = "llvm.x86.avx2.pmaxu.d"]
+ fn pmaxud(a: u32x8, b: u32x8) -> u32x8;
+ #[link_name = "llvm.x86.avx2.pmaxu.b"]
+ fn pmaxub(a: u8x32, b: u8x32) -> u8x32;
+ #[link_name = "llvm.x86.avx2.pmins.w"]
+ fn pminsw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.pmins.d"]
+ fn pminsd(a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.pmins.b"]
+ fn pminsb(a: i8x32, b: i8x32) -> i8x32;
+ #[link_name = "llvm.x86.avx2.pminu.w"]
+ fn pminuw(a: u16x16, b: u16x16) -> u16x16;
+ #[link_name = "llvm.x86.avx2.pminu.d"]
+ fn pminud(a: u32x8, b: u32x8) -> u32x8;
+ #[link_name = "llvm.x86.avx2.pminu.b"]
+ fn pminub(a: u8x32, b: u8x32) -> u8x32;
+ #[link_name = "llvm.x86.avx2.pmovmskb"]
+ fn pmovmskb(a: i8x32) -> i32;
+ #[link_name = "llvm.x86.avx2.mpsadbw"]
+ fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
+ #[link_name = "llvm.x86.avx2.pmulhu.w"]
+ fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
+ #[link_name = "llvm.x86.avx2.pmulh.w"]
+ fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.pmul.dq"]
+ fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
+ #[link_name = "llvm.x86.avx2.pmulu.dq"]
+ fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
+ #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
+ fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.packsswb"]
+ fn packsswb(a: i16x16, b: i16x16) -> i8x32;
+ #[link_name = "llvm.x86.avx2.packssdw"]
+ fn packssdw(a: i32x8, b: i32x8) -> i16x16;
+ #[link_name = "llvm.x86.avx2.packuswb"]
+ fn packuswb(a: i16x16, b: i16x16) -> u8x32;
+ #[link_name = "llvm.x86.avx2.packusdw"]
+ fn packusdw(a: i32x8, b: i32x8) -> u16x16;
+ #[link_name = "llvm.x86.avx2.psad.bw"]
+ fn psadbw(a: u8x32, b: u8x32) -> u64x4;
+ #[link_name = "llvm.x86.avx2.psign.b"]
+ fn psignb(a: i8x32, b: i8x32) -> i8x32;
+ #[link_name = "llvm.x86.avx2.psign.w"]
+ fn psignw(a: i16x16, b: i16x16) -> i16x16;
+ #[link_name = "llvm.x86.avx2.psign.d"]
+ fn psignd(a: i32x8, b: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psll.w"]
+ fn psllw(a: i16x16, count: i16x8) -> i16x16;
+ #[link_name = "llvm.x86.avx2.psll.d"]
+ fn pslld(a: i32x8, count: i32x4) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psll.q"]
+ fn psllq(a: i64x4, count: i64x2) -> i64x4;
+ #[link_name = "llvm.x86.avx2.pslli.w"]
+ fn pslliw(a: i16x16, imm8: i32) -> i16x16;
+ #[link_name = "llvm.x86.avx2.pslli.d"]
+ fn psllid(a: i32x8, imm8: i32) -> i32x8;
+ #[link_name = "llvm.x86.avx2.pslli.q"]
+ fn pslliq(a: i64x4, imm8: i32) -> i64x4;
+ #[link_name = "llvm.x86.avx2.psllv.d"]
+ fn psllvd(a: i32x4, count: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.psllv.d.256"]
+ fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psllv.q"]
+ fn psllvq(a: i64x2, count: i64x2) -> i64x2;
+ #[link_name = "llvm.x86.avx2.psllv.q.256"]
+ fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
+ #[link_name = "llvm.x86.avx2.psra.w"]
+ fn psraw(a: i16x16, count: i16x8) -> i16x16;
+ #[link_name = "llvm.x86.avx2.psra.d"]
+ fn psrad(a: i32x8, count: i32x4) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psrai.w"]
+ fn psraiw(a: i16x16, imm8: i32) -> i16x16;
+ #[link_name = "llvm.x86.avx2.psrai.d"]
+ fn psraid(a: i32x8, imm8: i32) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psrav.d"]
+ fn psravd(a: i32x4, count: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.psrav.d.256"]
+ fn psravd256(a: i32x8, count: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psrl.w"]
+ fn psrlw(a: i16x16, count: i16x8) -> i16x16;
+ #[link_name = "llvm.x86.avx2.psrl.d"]
+ fn psrld(a: i32x8, count: i32x4) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psrl.q"]
+ fn psrlq(a: i64x4, count: i64x2) -> i64x4;
+ #[link_name = "llvm.x86.avx2.psrli.w"]
+ fn psrliw(a: i16x16, imm8: i32) -> i16x16;
+ #[link_name = "llvm.x86.avx2.psrli.d"]
+ fn psrlid(a: i32x8, imm8: i32) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psrli.q"]
+ fn psrliq(a: i64x4, imm8: i32) -> i64x4;
+ #[link_name = "llvm.x86.avx2.psrlv.d"]
+ fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
+ #[link_name = "llvm.x86.avx2.psrlv.d.256"]
+ fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.psrlv.q"]
+ fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
+ #[link_name = "llvm.x86.avx2.psrlv.q.256"]
+ fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
+ #[link_name = "llvm.x86.avx2.pshuf.b"]
+ fn pshufb(a: u8x32, b: u8x32) -> u8x32;
+ #[link_name = "llvm.x86.avx2.permd"]
+ fn permd(a: u32x8, b: u32x8) -> u32x8;
+ #[link_name = "llvm.x86.avx2.permps"]
+ fn permps(a: __m256, b: i32x8) -> __m256;
+ #[link_name = "llvm.x86.avx2.vperm2i128"]
+ fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
+ #[link_name = "llvm.x86.avx2.gather.d.d"]
+ fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
+ #[link_name = "llvm.x86.avx2.gather.d.d.256"]
+ fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
+ #[link_name = "llvm.x86.avx2.gather.d.q"]
+ fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
+ #[link_name = "llvm.x86.avx2.gather.d.q.256"]
+ fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
+ #[link_name = "llvm.x86.avx2.gather.q.d"]
+ fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
+ #[link_name = "llvm.x86.avx2.gather.q.d.256"]
+ fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
+ #[link_name = "llvm.x86.avx2.gather.q.q"]
+ fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
+ #[link_name = "llvm.x86.avx2.gather.q.q.256"]
+ fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
+ #[link_name = "llvm.x86.avx2.gather.d.pd"]
+ fn pgatherdpd(
+ src: __m128d,
+ slice: *const i8,
+ offsets: i32x4,
+ mask: __m128d,
+ scale: i8,
+ ) -> __m128d;
+ #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
+ fn vpgatherdpd(
+ src: __m256d,
+ slice: *const i8,
+ offsets: i32x4,
+ mask: __m256d,
+ scale: i8,
+ ) -> __m256d;
+ #[link_name = "llvm.x86.avx2.gather.q.pd"]
+ fn pgatherqpd(
+ src: __m128d,
+ slice: *const i8,
+ offsets: i64x2,
+ mask: __m128d,
+ scale: i8,
+ ) -> __m128d;
+ #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
+ fn vpgatherqpd(
+ src: __m256d,
+ slice: *const i8,
+ offsets: i64x4,
+ mask: __m256d,
+ scale: i8,
+ ) -> __m256d;
+ #[link_name = "llvm.x86.avx2.gather.d.ps"]
+ fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
+ -> __m128;
+ #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
+ fn vpgatherdps(
+ src: __m256,
+ slice: *const i8,
+ offsets: i32x8,
+ mask: __m256,
+ scale: i8,
+ ) -> __m256;
+ #[link_name = "llvm.x86.avx2.gather.q.ps"]
+ fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
+ -> __m128;
+ #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
+ fn vpgatherqps(
+ src: __m128,
+ slice: *const i8,
+ offsets: i64x4,
+ mask: __m128,
+ scale: i8,
+ ) -> __m128;
+ #[link_name = "llvm.x86.avx2.psll.dq"]
+ fn vpslldq(a: i64x4, b: i32) -> i64x4;
+ #[link_name = "llvm.x86.avx2.psrl.dq"]
+ fn vpsrldq(a: i64x4, b: i32) -> i64x4;
+}
+
+#[cfg(test)]
+mod tests {
+
+ use stdarch_test::simd_test;
+
+ use crate::core_arch::x86::*;
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_abs_epi32() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi32(
+ 0, 1, -1, i32::MAX,
+ i32::MIN, 100, -100, -32,
+ );
+ let r = _mm256_abs_epi32(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi32(
+ 0, 1, 1, i32::MAX,
+ i32::MAX.wrapping_add(1), 100, 100, 32,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_abs_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, -1, 2, -2, 3, -3, 4,
+ -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
+ );
+ let r = _mm256_abs_epi16(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 0, 1, 1, 2, 2, 3, 3, 4,
+ 4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_abs_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 0, 1, -1, 2, -2, 3, -3, 4,
+ -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
+ 0, 1, -1, 2, -2, 3, -3, 4,
+ -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
+ );
+ let r = _mm256_abs_epi8(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 0, 1, 1, 2, 2, 3, 3, 4,
+ 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
+ 0, 1, 1, 2, 2, 3, 3, 4,
+ 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_add_epi64() {
+ let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
+ let b = _mm256_setr_epi64x(-1, 0, 1, 2);
+ let r = _mm256_add_epi64(a, b);
+ let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_add_epi32() {
+ let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
+ let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_add_epi32(a, b);
+ let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_add_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ let r = _mm256_add_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_add_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ let r = _mm256_add_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi8(
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ );
+ let r = _mm256_adds_epi8(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78,
+ 80, 82, 84, 86, 88, 90, 92, 94,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epi8_saturate_positive() {
+ let a = _mm256_set1_epi8(0x7F);
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_adds_epi8(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epi8_saturate_negative() {
+ let a = _mm256_set1_epi8(-0x80);
+ let b = _mm256_set1_epi8(-1);
+ let r = _mm256_adds_epi8(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi16(
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ );
+ let r = _mm256_adds_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ );
+
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epi16_saturate_positive() {
+ let a = _mm256_set1_epi16(0x7FFF);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_adds_epi16(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epi16_saturate_negative() {
+ let a = _mm256_set1_epi16(-0x8000);
+ let b = _mm256_set1_epi16(-1);
+ let r = _mm256_adds_epi16(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epu8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi8(
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ );
+ let r = _mm256_adds_epu8(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 66, 68, 70, 72, 74, 76, 78,
+ 80, 82, 84, 86, 88, 90, 92, 94,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epu8_saturate() {
+ let a = _mm256_set1_epi8(!0);
+ let b = _mm256_set1_epi8(1);
+ let r = _mm256_adds_epu8(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epu16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi16(
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ );
+ let r = _mm256_adds_epu16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ );
+
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_adds_epu16_saturate() {
+ let a = _mm256_set1_epi16(!0);
+ let b = _mm256_set1_epi16(1);
+ let r = _mm256_adds_epu16(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_and_si256() {
+ let a = _mm256_set1_epi8(5);
+ let b = _mm256_set1_epi8(3);
+ let got = _mm256_and_si256(a, b);
+ assert_eq_m256i(got, _mm256_set1_epi8(1));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_andnot_si256() {
+ let a = _mm256_set1_epi8(5);
+ let b = _mm256_set1_epi8(3);
+ let got = _mm256_andnot_si256(a, b);
+ assert_eq_m256i(got, _mm256_set1_epi8(2));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_avg_epu8() {
+ let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
+ let r = _mm256_avg_epu8(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi8(6));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_avg_epu16() {
+ let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
+ let r = _mm256_avg_epu16(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi16(6));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_blend_epi32() {
+ let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
+ let e = _mm_setr_epi32(9, 3, 3, 3);
+ let r = _mm_blend_epi32::<0x01>(a, b);
+ assert_eq_m128i(r, e);
+
+ let r = _mm_blend_epi32::<0x0E>(b, a);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_blend_epi32() {
+ let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
+ let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
+ let r = _mm256_blend_epi32::<0x01>(a, b);
+ assert_eq_m256i(r, e);
+
+ let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
+ let r = _mm256_blend_epi32::<0x82>(a, b);
+ assert_eq_m256i(r, e);
+
+ let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
+ let r = _mm256_blend_epi32::<0x7C>(a, b);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_blend_epi16() {
+ let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
+ let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
+ let r = _mm256_blend_epi16::<0x01>(a, b);
+ assert_eq_m256i(r, e);
+
+ let r = _mm256_blend_epi16::<0xFE>(b, a);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_blendv_epi8() {
+ let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
+ let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
+ let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
+ let r = _mm256_blendv_epi8(a, b, mask);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_broadcastb_epi8() {
+ let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
+ let res = _mm_broadcastb_epi8(a);
+ assert_eq_m128i(res, _mm_set1_epi8(0x2a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastb_epi8() {
+ let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
+ let res = _mm256_broadcastb_epi8(a);
+ assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_broadcastd_epi32() {
+ let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
+ let res = _mm_broadcastd_epi32(a);
+ assert_eq_m128i(res, _mm_set1_epi32(0x2a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastd_epi32() {
+ let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
+ let res = _mm256_broadcastd_epi32(a);
+ assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_broadcastq_epi64() {
+ let a = _mm_setr_epi64x(0x1ffffffff, 0);
+ let res = _mm_broadcastq_epi64(a);
+ assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastq_epi64() {
+ let a = _mm_setr_epi64x(0x1ffffffff, 0);
+ let res = _mm256_broadcastq_epi64(a);
+ assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_broadcastsd_pd() {
+ let a = _mm_setr_pd(6.28, 3.14);
+ let res = _mm_broadcastsd_pd(a);
+ assert_eq_m128d(res, _mm_set1_pd(6.28f64));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastsd_pd() {
+ let a = _mm_setr_pd(6.28, 3.14);
+ let res = _mm256_broadcastsd_pd(a);
+ assert_eq_m256d(res, _mm256_set1_pd(6.28f64));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastsi128_si256() {
+ let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
+ let res = _mm256_broadcastsi128_si256(a);
+ let retval = _mm256_setr_epi64x(
+ 0x0987654321012334,
+ 0x5678909876543210,
+ 0x0987654321012334,
+ 0x5678909876543210,
+ );
+ assert_eq_m256i(res, retval);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_broadcastss_ps() {
+ let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
+ let res = _mm_broadcastss_ps(a);
+ assert_eq_m128(res, _mm_set1_ps(6.28f32));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastss_ps() {
+ let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
+ let res = _mm256_broadcastss_ps(a);
+ assert_eq_m256(res, _mm256_set1_ps(6.28f32));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_broadcastw_epi16() {
+ let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
+ let res = _mm_broadcastw_epi16(a);
+ assert_eq_m128i(res, _mm_set1_epi16(0x22b));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_broadcastw_epi16() {
+ let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
+ let res = _mm256_broadcastw_epi16(a);
+ assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpeq_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi8(
+ 31, 30, 2, 28, 27, 26, 25, 24,
+ 23, 22, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,
+ );
+ let r = _mm256_cmpeq_epi8(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpeq_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi16(
+ 15, 14, 2, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,
+ );
+ let r = _mm256_cmpeq_epi16(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpeq_epi32() {
+ let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
+ let r = _mm256_cmpeq_epi32(a, b);
+ let e = _mm256_set1_epi32(0);
+ let e = _mm256_insert_epi32::<2>(e, !0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpeq_epi64() {
+ let a = _mm256_setr_epi64x(0, 1, 2, 3);
+ let b = _mm256_setr_epi64x(3, 2, 2, 0);
+ let r = _mm256_cmpeq_epi64(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpgt_epi8() {
+ let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
+ let b = _mm256_set1_epi8(0);
+ let r = _mm256_cmpgt_epi8(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpgt_epi16() {
+ let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
+ let b = _mm256_set1_epi16(0);
+ let r = _mm256_cmpgt_epi16(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpgt_epi32() {
+ let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
+ let b = _mm256_set1_epi32(0);
+ let r = _mm256_cmpgt_epi32(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cmpgt_epi64() {
+ let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
+ let b = _mm256_set1_epi64x(0);
+ let r = _mm256_cmpgt_epi64(a, b);
+ assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepi8_epi16() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 0, -1, 1, -2, 2, -3, 3,
+ -4, 4, -5, 5, -6, 6, -7, 7,
+ );
+ #[rustfmt::skip]
+ let r = _mm256_setr_epi16(
+ 0, 0, -1, 1, -2, 2, -3, 3,
+ -4, 4, -5, 5, -6, 6, -7, 7,
+ );
+ assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepi8_epi32() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 0, -1, 1, -2, 2, -3, 3,
+ -4, 4, -5, 5, -6, 6, -7, 7,
+ );
+ let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
+ assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepi8_epi64() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 0, -1, 1, -2, 2, -3, 3,
+ -4, 4, -5, 5, -6, 6, -7, 7,
+ );
+ let r = _mm256_setr_epi64x(0, 0, -1, 1);
+ assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepi16_epi32() {
+ let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
+ let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
+ assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepi16_epi64() {
+ let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
+ let r = _mm256_setr_epi64x(0, 0, -1, 1);
+ assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepi32_epi64() {
+ let a = _mm_setr_epi32(0, 0, -1, 1);
+ let r = _mm256_setr_epi64x(0, 0, -1, 1);
+ assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepu16_epi32() {
+ let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepu16_epi64() {
+ let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ let r = _mm256_setr_epi64x(0, 1, 2, 3);
+ assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepu32_epi64() {
+ let a = _mm_setr_epi32(0, 1, 2, 3);
+ let r = _mm256_setr_epi64x(0, 1, 2, 3);
+ assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepu8_epi16() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ #[rustfmt::skip]
+ let r = _mm256_setr_epi16(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepu8_epi32() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtepu8_epi64() {
+ #[rustfmt::skip]
+ let a = _mm_setr_epi8(
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ let r = _mm256_setr_epi64x(0, 1, 2, 3);
+ assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_extracti128_si256() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let r = _mm256_extracti128_si256::<1>(a);
+ let e = _mm_setr_epi64x(3, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_hadd_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_hadd_epi16(a, b);
+ let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_hadd_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_hadd_epi32(a, b);
+ let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_hadds_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let a = _mm256_insert_epi16::<0>(a, 0x7fff);
+ let a = _mm256_insert_epi16::<1>(a, 1);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_hadds_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 0x7FFF, 4, 4, 4, 8, 8, 8, 8,
+ 4, 4, 4, 4, 8, 8, 8, 8,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_hsub_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_hsub_epi16(a, b);
+ let e = _mm256_set1_epi16(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_hsub_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_hsub_epi32(a, b);
+ let e = _mm256_set1_epi32(0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_hsubs_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let a = _mm256_insert_epi16::<0>(a, 0x7fff);
+ let a = _mm256_insert_epi16::<1>(a, -1);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_hsubs_epi16(a, b);
+ let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_madd_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_madd_epi16(a, b);
+ let e = _mm256_set1_epi32(16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_inserti128_si256() {
+ let a = _mm256_setr_epi64x(1, 2, 3, 4);
+ let b = _mm_setr_epi64x(7, 8);
+ let r = _mm256_inserti128_si256::<1>(a, b);
+ let e = _mm256_setr_epi64x(1, 2, 7, 8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_maddubs_epi16() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_maddubs_epi16(a, b);
+ let e = _mm256_set1_epi16(16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_maskload_epi32() {
+ let nums = [1, 2, 3, 4];
+ let a = &nums as *const i32;
+ let mask = _mm_setr_epi32(-1, 0, 0, -1);
+ let r = _mm_maskload_epi32(a, mask);
+ let e = _mm_setr_epi32(1, 0, 0, 4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_maskload_epi32() {
+ let nums = [1, 2, 3, 4, 5, 6, 7, 8];
+ let a = &nums as *const i32;
+ let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
+ let r = _mm256_maskload_epi32(a, mask);
+ let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_maskload_epi64() {
+ let nums = [1_i64, 2_i64];
+ let a = &nums as *const i64;
+ let mask = _mm_setr_epi64x(0, -1);
+ let r = _mm_maskload_epi64(a, mask);
+ let e = _mm_setr_epi64x(0, 2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_maskload_epi64() {
+ let nums = [1_i64, 2_i64, 3_i64, 4_i64];
+ let a = &nums as *const i64;
+ let mask = _mm256_setr_epi64x(0, -1, -1, 0);
+ let r = _mm256_maskload_epi64(a, mask);
+ let e = _mm256_setr_epi64x(0, 2, 3, 0);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_maskstore_epi32() {
+ let a = _mm_setr_epi32(1, 2, 3, 4);
+ let mut arr = [-1, -1, -1, -1];
+ let mask = _mm_setr_epi32(-1, 0, 0, -1);
+ _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
+ let e = [1, -1, -1, 4];
+ assert_eq!(arr, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_maskstore_epi32() {
+ let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
+ let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
+ let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
+ _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
+ let e = [1, -1, -1, 42, -1, 6, 7, -1];
+ assert_eq!(arr, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_maskstore_epi64() {
+ let a = _mm_setr_epi64x(1_i64, 2_i64);
+ let mut arr = [-1_i64, -1_i64];
+ let mask = _mm_setr_epi64x(0, -1);
+ _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
+ let e = [-1, 2];
+ assert_eq!(arr, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_maskstore_epi64() {
+ let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
+ let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
+ let mask = _mm256_setr_epi64x(0, -1, -1, 0);
+ _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
+ let e = [-1, 2, 3, -1];
+ assert_eq!(arr, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_max_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_max_epi16(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_max_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_max_epi32(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_max_epi8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_max_epi8(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_max_epu16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_max_epu16(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_max_epu32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_max_epu32(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_max_epu8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_max_epu8(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_min_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_min_epi16(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_min_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_min_epi32(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_min_epi8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_min_epi8(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_min_epu16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_min_epu16(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_min_epu32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_min_epu32(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_min_epu8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_min_epu8(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_movemask_epi8() {
+ let a = _mm256_set1_epi8(-1);
+ let r = _mm256_movemask_epi8(a);
+ let e = -1;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mpsadbw_epu8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
+ let r = _mm256_mpsadbw_epu8::<0>(a, b);
+ let e = _mm256_set1_epi16(8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mul_epi32() {
+ let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
+ let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_mul_epi32(a, b);
+ let e = _mm256_setr_epi64x(0, 0, 10, 14);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mul_epu32() {
+ let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
+ let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_mul_epu32(a, b);
+ let e = _mm256_setr_epi64x(0, 0, 10, 14);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mulhi_epi16() {
+ let a = _mm256_set1_epi16(6535);
+ let b = _mm256_set1_epi16(6535);
+ let r = _mm256_mulhi_epi16(a, b);
+ let e = _mm256_set1_epi16(651);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mulhi_epu16() {
+ let a = _mm256_set1_epi16(6535);
+ let b = _mm256_set1_epi16(6535);
+ let r = _mm256_mulhi_epu16(a, b);
+ let e = _mm256_set1_epi16(651);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mullo_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_mullo_epi16(a, b);
+ let e = _mm256_set1_epi16(8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mullo_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_mullo_epi32(a, b);
+ let e = _mm256_set1_epi32(8);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mulhrs_epi16() {
+ let a = _mm256_set1_epi16(0x4000); // 0.5 in Q15 fixed point
+ let b = _mm256_set1_epi16(0x4000); // 0.5 in Q15 fixed point
+ let r = _mm256_mulhrs_epi16(a, b);
+ // (((0x4000 * 0x4000) >> 14) + 1) >> 1 == 0x2000, i.e. 0.25 in Q15.
+ let e = _mm256_set1_epi16(0x2000);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_or_si256() {
+ let a = _mm256_set1_epi8(-1);
+ let b = _mm256_set1_epi8(0);
+ let r = _mm256_or_si256(a, b);
+ assert_eq_m256i(r, a);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_packs_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_packs_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ );
+
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_packs_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_packs_epi32(a, b);
+ let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_packus_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(4);
+ let r = _mm256_packus_epi16(a, b);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ );
+
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_packus_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(4);
+ let r = _mm256_packus_epi32(a, b);
+ let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sad_epu8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(4);
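+ // Each 64-bit result sums the absolute differences of eight byte pairs:
+ // 8 * |2 - 4| = 16.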
+ let r = _mm256_sad_epu8(a, b);
+ let e = _mm256_set1_epi64x(16);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_shufflehi_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 0, 1, 2, 3, 11, 22, 33, 44,
+ 4, 5, 6, 7, 55, 66, 77, 88,
+ );
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 0, 1, 2, 3, 44, 22, 22, 11,
+ 4, 5, 6, 7, 88, 66, 66, 55,
+ );
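+ // The immediate holds four 2-bit indices (3, 1, 1, 0 from low to high) that
+ // reorder the upper four words of each 128-bit lane; the lower four words
+ // are left unchanged.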
+ let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_shufflelo_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ 11, 22, 33, 44, 0, 1, 2, 3,
+ 55, 66, 77, 88, 4, 5, 6, 7,
+ );
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi16(
+ 44, 22, 22, 11, 0, 1, 2, 3,
+ 88, 66, 66, 55, 4, 5, 6, 7,
+ );
+ let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sign_epi16() {
+ let a = _mm256_set1_epi16(2);
+ let b = _mm256_set1_epi16(-1);
+ let r = _mm256_sign_epi16(a, b);
+ let e = _mm256_set1_epi16(-2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sign_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(-1);
+ let r = _mm256_sign_epi32(a, b);
+ let e = _mm256_set1_epi32(-2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sign_epi8() {
+ let a = _mm256_set1_epi8(2);
+ let b = _mm256_set1_epi8(-1);
+ let r = _mm256_sign_epi8(a, b);
+ let e = _mm256_set1_epi8(-2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sll_epi16() {
+ let a = _mm256_set1_epi16(0xFF);
+ let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
+ let r = _mm256_sll_epi16(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sll_epi32() {
+ let a = _mm256_set1_epi32(0xFFFF);
+ let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
+ let r = _mm256_sll_epi32(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sll_epi64() {
+ let a = _mm256_set1_epi64x(0xFFFFFFFF);
+ let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
+ let r = _mm256_sll_epi64(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_slli_epi16() {
+ assert_eq_m256i(
+ _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
+ _mm256_set1_epi16(0xFF0),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_slli_epi32() {
+ assert_eq_m256i(
+ _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
+ _mm256_set1_epi32(0xFFFF0),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_slli_epi64() {
+ assert_eq_m256i(
+ _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
+ _mm256_set1_epi64x(0xFFFFFFFF0),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_slli_si256() {
+ let a = _mm256_set1_epi64x(0xFFFFFFFF);
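+ // Each 128-bit lane is shifted left by 3 bytes with zero fill, turning every
+ // 64-bit element 0xFFFFFFFF into 0xFFFFFFFF000000.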
+ let r = _mm256_slli_si256::<3>(a);
+ assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_sllv_epi32() {
+ let a = _mm_set1_epi32(2);
+ let b = _mm_set1_epi32(1);
+ let r = _mm_sllv_epi32(a, b);
+ let e = _mm_set1_epi32(4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sllv_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let b = _mm256_set1_epi32(1);
+ let r = _mm256_sllv_epi32(a, b);
+ let e = _mm256_set1_epi32(4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_sllv_epi64() {
+ let a = _mm_set1_epi64x(2);
+ let b = _mm_set1_epi64x(1);
+ let r = _mm_sllv_epi64(a, b);
+ let e = _mm_set1_epi64x(4);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sllv_epi64() {
+ let a = _mm256_set1_epi64x(2);
+ let b = _mm256_set1_epi64x(1);
+ let r = _mm256_sllv_epi64(a, b);
+ let e = _mm256_set1_epi64x(4);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sra_epi16() {
+ let a = _mm256_set1_epi16(-1);
+ let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ let r = _mm256_sra_epi16(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi16(-1));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sra_epi32() {
+ let a = _mm256_set1_epi32(-1);
+ let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
+ let r = _mm256_sra_epi32(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi32(-1));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srai_epi16() {
+ assert_eq_m256i(
+ _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
+ _mm256_set1_epi16(-1),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srai_epi32() {
+ assert_eq_m256i(
+ _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
+ _mm256_set1_epi32(-1),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_srav_epi32() {
+ let a = _mm_set1_epi32(4);
+ let count = _mm_set1_epi32(1);
+ let r = _mm_srav_epi32(a, count);
+ let e = _mm_set1_epi32(2);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srav_epi32() {
+ let a = _mm256_set1_epi32(4);
+ let count = _mm256_set1_epi32(1);
+ let r = _mm256_srav_epi32(a, count);
+ let e = _mm256_set1_epi32(2);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srli_si256() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ let r = _mm256_srli_si256::<3>(a);
+ #[rustfmt::skip]
+ let e = _mm256_setr_epi8(
+ 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 16, 0, 0, 0,
+ 20, 21, 22, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 32, 0, 0, 0,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srl_epi16() {
+ let a = _mm256_set1_epi16(0xFF);
+ let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
+ let r = _mm256_srl_epi16(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi16(0xF));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srl_epi32() {
+ let a = _mm256_set1_epi32(0xFFFF);
+ let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
+ let r = _mm256_srl_epi32(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srl_epi64() {
+ let a = _mm256_set1_epi64x(0xFFFFFFFF);
+ let b = _mm_setr_epi64x(4, 0);
+ let r = _mm256_srl_epi64(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srli_epi16() {
+ assert_eq_m256i(
+ _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
+ _mm256_set1_epi16(0xF),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srli_epi32() {
+ assert_eq_m256i(
+ _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
+ _mm256_set1_epi32(0xFFF),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srli_epi64() {
+ assert_eq_m256i(
+ _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
+ _mm256_set1_epi64x(0xFFFFFFF),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_srlv_epi32() {
+ let a = _mm_set1_epi32(2);
+ let count = _mm_set1_epi32(1);
+ let r = _mm_srlv_epi32(a, count);
+ let e = _mm_set1_epi32(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srlv_epi32() {
+ let a = _mm256_set1_epi32(2);
+ let count = _mm256_set1_epi32(1);
+ let r = _mm256_srlv_epi32(a, count);
+ let e = _mm256_set1_epi32(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_srlv_epi64() {
+ let a = _mm_set1_epi64x(2);
+ let count = _mm_set1_epi64x(1);
+ let r = _mm_srlv_epi64(a, count);
+ let e = _mm_set1_epi64x(1);
+ assert_eq_m128i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_srlv_epi64() {
+ let a = _mm256_set1_epi64x(2);
+ let count = _mm256_set1_epi64x(1);
+ let r = _mm256_srlv_epi64(a, count);
+ let e = _mm256_set1_epi64x(1);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sub_epi16() {
+ let a = _mm256_set1_epi16(4);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_sub_epi16(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sub_epi32() {
+ let a = _mm256_set1_epi32(4);
+ let b = _mm256_set1_epi32(2);
+ let r = _mm256_sub_epi32(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sub_epi64() {
+ let a = _mm256_set1_epi64x(4);
+ let b = _mm256_set1_epi64x(2);
+ let r = _mm256_sub_epi64(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_sub_epi8() {
+ let a = _mm256_set1_epi8(4);
+ let b = _mm256_set1_epi8(2);
+ let r = _mm256_sub_epi8(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_subs_epi16() {
+ let a = _mm256_set1_epi16(4);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_subs_epi16(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_subs_epi8() {
+ let a = _mm256_set1_epi8(4);
+ let b = _mm256_set1_epi8(2);
+ let r = _mm256_subs_epi8(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_subs_epu16() {
+ let a = _mm256_set1_epi16(4);
+ let b = _mm256_set1_epi16(2);
+ let r = _mm256_subs_epu16(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_subs_epu8() {
+ let a = _mm256_set1_epi8(4);
+ let b = _mm256_set1_epi8(2);
+ let r = _mm256_subs_epu8(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_xor_si256() {
+ let a = _mm256_set1_epi8(5);
+ let b = _mm256_set1_epi8(3);
+ let r = _mm256_xor_si256(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi8(6));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_alignr_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi8(
+ -1, -2, -3, -4, -5, -6, -7, -8,
+ -9, -10, -11, -12, -13, -14, -15, -16,
+ -17, -18, -19, -20, -21, -22, -23, -24,
+ -25, -26, -27, -28, -29, -30, -31, -32,
+ );
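+ // Per 128-bit lane, VPALIGNR concatenates the lane of `a` (upper half) with
+ // the lane of `b` (lower half), shifts the pair right by IMM8 bytes and
+ // keeps the low 16 bytes.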
+ let r = _mm256_alignr_epi8::<33>(a, b);
+ assert_eq_m256i(r, _mm256_set1_epi8(0));
+
+ let r = _mm256_alignr_epi8::<17>(a, b);
+ #[rustfmt::skip]
+ let expected = _mm256_setr_epi8(
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 0,
+ 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 0,
+ );
+ assert_eq_m256i(r, expected);
+
+ let r = _mm256_alignr_epi8::<4>(a, b);
+ #[rustfmt::skip]
+ let expected = _mm256_setr_epi8(
+ -5, -6, -7, -8, -9, -10, -11, -12,
+ -13, -14, -15, -16, 1, 2, 3, 4,
+ -21, -22, -23, -24, -25, -26, -27, -28,
+ -29, -30, -31, -32, 17, 18, 19, 20,
+ );
+ assert_eq_m256i(r, expected);
+
+ #[rustfmt::skip]
+ let expected = _mm256_setr_epi8(
+ -1, -2, -3, -4, -5, -6, -7, -8,
+ -9, -10, -11, -12, -13, -14, -15, -16,
+ -17, -18, -19, -20, -21, -22, -23, -24,
+ -25, -26, -27, -28, -29, -30, -31, -32,
+ );
+ let r = _mm256_alignr_epi8::<16>(a, b);
+ assert_eq_m256i(r, expected);
+
+ let r = _mm256_alignr_epi8::<15>(a, b);
+ #[rustfmt::skip]
+ let expected = _mm256_setr_epi8(
+ -16, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ -32, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ );
+ assert_eq_m256i(r, expected);
+
+ let r = _mm256_alignr_epi8::<0>(a, b);
+ assert_eq_m256i(r, b);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_shuffle_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32,
+ );
+ #[rustfmt::skip]
+ let b = _mm256_setr_epi8(
+ 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+ 12, 5, 5, 10, 4, 1, 8, 0,
+ 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+ 12, 5, 5, 10, 4, 1, 8, 0,
+ );
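+ // Each byte of `b` indexes into the corresponding 128-bit lane of `a` using
+ // its low four bits; a byte with the high bit set (128 here) zeroes the
+ // destination byte.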
+ #[rustfmt::skip]
+ let expected = _mm256_setr_epi8(
+ 5, 0, 5, 4, 9, 13, 7, 4,
+ 13, 6, 6, 11, 5, 2, 9, 1,
+ 21, 0, 21, 20, 25, 29, 23, 20,
+ 29, 22, 22, 27, 21, 18, 25, 17,
+ );
+ let r = _mm256_shuffle_epi8(a, b);
+ assert_eq_m256i(r, expected);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_permutevar8x32_epi32() {
+ let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
+ let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
+ let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
+ let r = _mm256_permutevar8x32_epi32(a, b);
+ assert_eq_m256i(r, expected);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_permute4x64_epi64() {
+ let a = _mm256_setr_epi64x(100, 200, 300, 400);
+ let expected = _mm256_setr_epi64x(400, 100, 200, 100);
+ let r = _mm256_permute4x64_epi64::<0b00010011>(a);
+ assert_eq_m256i(r, expected);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_permute2x128_si256() {
+ let a = _mm256_setr_epi64x(100, 200, 500, 600);
+ let b = _mm256_setr_epi64x(300, 400, 700, 800);
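+ // The low nibble of the immediate (0b0011) selects the high lane of `b` for
+ // the result's low half; the high nibble (0b0001) selects the high lane of `a`.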
+ let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
+ let e = _mm256_setr_epi64x(700, 800, 500, 600);
+ assert_eq_m256i(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_permute4x64_pd() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
+ let e = _mm256_setr_pd(4., 1., 2., 1.);
+ assert_eq_m256d(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_permutevar8x32_ps() {
+ let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
+ let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
+ let r = _mm256_permutevar8x32_ps(a, b);
+ let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
+ assert_eq_m256(r, e);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i32gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i32gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
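+ // Elements whose mask value has the sign bit set are gathered from memory;
+ // the others are taken from the `src` argument (256 here).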
+ let r = _mm_mask_i32gather_epi32::<4>(
+ _mm_set1_epi32(256),
+ arr.as_ptr(),
+ _mm_setr_epi32(0, 16, 64, 96),
+ _mm_setr_epi32(-1, -1, -1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i32gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r =
+ _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
+ assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i32gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r = _mm256_mask_i32gather_epi32::<4>(
+ _mm256_set1_epi32(256),
+ arr.as_ptr(),
+ _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
+ _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
+ );
+ assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i32gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i32gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm_mask_i32gather_ps::<4>(
+ _mm_set1_ps(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi32(0, 16, 64, 96),
+ _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
+ );
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i32gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r =
+ _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
+ assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i32gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm256_mask_i32gather_ps::<4>(
+ _mm256_set1_ps(256.0),
+ arr.as_ptr(),
+ _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
+ _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
+ );
+ assert_eq_m256(
+ r,
+ _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
+ );
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i32gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
+ assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i32gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm_mask_i32gather_epi64::<8>(
+ _mm_set1_epi64x(256),
+ arr.as_ptr(),
+ _mm_setr_epi32(16, 16, 16, 16),
+ _mm_setr_epi64x(-1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i32gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+ assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i32gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm256_mask_i32gather_epi64::<8>(
+ _mm256_set1_epi64x(256),
+ arr.as_ptr(),
+ _mm_setr_epi32(0, 16, 64, 96),
+ _mm256_setr_epi64x(-1, -1, -1, 0),
+ );
+ assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i32gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
+ assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i32gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm_mask_i32gather_pd::<8>(
+ _mm_set1_pd(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi32(16, 16, 16, 16),
+ _mm_setr_pd(-1.0, 0.0),
+ );
+ assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i32gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
+ assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i32gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm256_mask_i32gather_pd::<8>(
+ _mm256_set1_pd(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi32(0, 16, 64, 96),
+ _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
+ );
+ assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i64gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i64gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r = _mm_mask_i64gather_epi32::<4>(
+ _mm_set1_epi32(256),
+ arr.as_ptr(),
+ _mm_setr_epi64x(0, 16),
+ _mm_setr_epi32(-1, 0, -1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i64gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i64gather_epi32() {
+ let mut arr = [0i32; 128];
+ for i in 0..128i32 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 4 is word-addressing
+ let r = _mm256_mask_i64gather_epi32::<4>(
+ _mm_set1_epi32(256),
+ arr.as_ptr(),
+ _mm256_setr_epi64x(0, 16, 64, 96),
+ _mm_setr_epi32(-1, -1, -1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i64gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i64gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm_mask_i64gather_ps::<4>(
+ _mm_set1_ps(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi64x(0, 16),
+ _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
+ );
+ assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i64gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i64gather_ps() {
+ let mut arr = [0.0f32; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 4 is word-addressing for f32s
+ let r = _mm256_mask_i64gather_ps::<4>(
+ _mm_set1_ps(256.0),
+ arr.as_ptr(),
+ _mm256_setr_epi64x(0, 16, 64, 96),
+ _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
+ );
+ assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i64gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+ assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i64gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm_mask_i64gather_epi64::<8>(
+ _mm_set1_epi64x(256),
+ arr.as_ptr(),
+ _mm_setr_epi64x(16, 16),
+ _mm_setr_epi64x(-1, 0),
+ );
+ assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i64gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+ assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i64gather_epi64() {
+ let mut arr = [0i64; 128];
+ for i in 0..128i64 {
+ arr[i as usize] = i;
+ }
+ // A multiplier of 8 is word-addressing for i64s
+ let r = _mm256_mask_i64gather_epi64::<8>(
+ _mm256_set1_epi64x(256),
+ arr.as_ptr(),
+ _mm256_setr_epi64x(0, 16, 64, 96),
+ _mm256_setr_epi64x(-1, -1, -1, 0),
+ );
+ assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_i64gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
+ assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm_mask_i64gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm_mask_i64gather_pd::<8>(
+ _mm_set1_pd(256.0),
+ arr.as_ptr(),
+ _mm_setr_epi64x(16, 16),
+ _mm_setr_pd(-1.0, 0.0),
+ );
+ assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_i64gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
+ assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_mask_i64gather_pd() {
+ let mut arr = [0.0f64; 128];
+ let mut j = 0.0;
+ for i in 0..128usize {
+ arr[i] = j;
+ j += 1.0;
+ }
+ // A multiplier of 8 is word-addressing for f64s
+ let r = _mm256_mask_i64gather_pd::<8>(
+ _mm256_set1_pd(256.0),
+ arr.as_ptr(),
+ _mm256_setr_epi64x(0, 16, 64, 96),
+ _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
+ );
+ assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
+ }
+
+ #[simd_test(enable = "avx")]
+ unsafe fn test_mm256_extract_epi8() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi8(
+ -1, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+ );
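+ // The selected byte is zero-extended to an i32, so the -1 byte in element 0
+ // reads back as 0xFF.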
+ let r1 = _mm256_extract_epi8::<0>(a);
+ let r2 = _mm256_extract_epi8::<3>(a);
+ assert_eq!(r1, 0xFF);
+ assert_eq!(r2, 3);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_extract_epi16() {
+ #[rustfmt::skip]
+ let a = _mm256_setr_epi16(
+ -1, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ );
+ let r1 = _mm256_extract_epi16::<0>(a);
+ let r2 = _mm256_extract_epi16::<3>(a);
+ assert_eq!(r1, 0xFFFF);
+ assert_eq!(r2, 3);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_extract_epi32() {
+ let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
+ let r1 = _mm256_extract_epi32::<0>(a);
+ let r2 = _mm256_extract_epi32::<3>(a);
+ assert_eq!(r1, -1);
+ assert_eq!(r2, 3);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtsd_f64() {
+ let a = _mm256_setr_pd(1., 2., 3., 4.);
+ let r = _mm256_cvtsd_f64(a);
+ assert_eq!(r, 1.);
+ }
+
+ #[simd_test(enable = "avx2")]
+ unsafe fn test_mm256_cvtsi256_si32() {
+ let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ let r = _mm256_cvtsi256_si32(a);
+ assert_eq!(r, 1);
+ }
+}