Diffstat:
 library/stdarch/crates/core_arch/src/x86/ssse3.rs | 537 ++++++++++++++++++++
 1 file changed, 537 insertions(+), 0 deletions(-)
diff --git a/library/stdarch/crates/core_arch/src/x86/ssse3.rs b/library/stdarch/crates/core_arch/src/x86/ssse3.rs
new file mode 100644
index 000000000..4beb496b6
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/ssse3.rs
@@ -0,0 +1,537 @@
+//! Supplemental Streaming SIMD Extensions 3 (SSSE3)
+
+use crate::{
+    core_arch::{simd::*, simd_llvm::*, x86::*},
+    mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Computes the absolute value of packed 8-bit signed integers in `a` and
+/// returns the unsigned results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pabsb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i {
+    transmute(pabsb128(a.as_i8x16()))
+}
+
+/// Computes the absolute value of each of the packed 16-bit signed integers
+/// in `a` and returns the 16-bit unsigned results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pabsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i {
+    transmute(pabsw128(a.as_i16x8()))
+}
+
+/// Computes the absolute value of each of the packed 32-bit signed integers
+/// in `a` and returns the 32-bit unsigned results.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pabsd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i {
+    transmute(pabsd128(a.as_i32x4()))
+}
+
+/// Shuffles bytes from `a` according to the content of `b`.
+///
+/// The low 4 bits of each byte of `b` are used as an index
+/// into the 16 bytes of `a`.
+///
+/// In addition, if the most significant bit of a byte of `b`
+/// is set, the corresponding destination byte is set to 0.
+///
+/// Picturing `a` and `b` as `[u8; 16]`, `_mm_shuffle_epi8` is
+/// logically equivalent to:
+///
+/// ```
+/// fn mm_shuffle_epi8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
+///     let mut r = [0u8; 16];
+///     for i in 0..16 {
+///         // if the most significant bit of b is set,
+///         // then the destination byte is set to 0.
+///         if b[i] & 0x80 == 0u8 {
+///             r[i] = a[(b[i] % 16) as usize];
+///         }
+///     }
+///     r
+/// }
+/// ```
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pshufb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i {
+    transmute(pshufb128(a.as_u8x16(), b.as_u8x16()))
+}
+
+/// Concatenates the 16-byte blocks of `a` and `b` into a 32-byte temporary
+/// result, shifts the result right by `IMM8` bytes, and returns the low 16
+/// bytes.
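+///
+/// Picturing `a` and `b` as `[u8; 16]`, this is logically equivalent to the
+/// following plain-Rust sketch (a model of the semantics, not the
+/// implementation below):
+///
+/// ```
+/// fn mm_alignr_epi8(a: [u8; 16], b: [u8; 16], imm8: usize) -> [u8; 16] {
+///     // Concatenate with `a` in the high half and `b` in the low half.
+///     let mut concat = [0u8; 32];
+///     concat[..16].copy_from_slice(&b);
+///     concat[16..].copy_from_slice(&a);
+///     let mut r = [0u8; 16];
+///     for i in 0..16 {
+///         // Bytes shifted in past the end of the concatenation are zero.
+///         if imm8 + i < 32 {
+///             r[i] = concat[imm8 + i];
+///         }
+///     }
+///     r
+/// }
+/// ```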
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(palignr, IMM8 = 15))]
+#[rustc_legacy_const_generics(2)]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_alignr_epi8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    static_assert_imm8!(IMM8);
+    // If palignr is shifting the pair of vectors more than the size of two
+    // lanes, emit zero.
+    if IMM8 > 32 {
+        return _mm_set1_epi8(0);
+    }
+    // If palignr is shifting the pair of input vectors more than one lane,
+    // but less than two lanes, convert to shifting in zeroes.
+    let (a, b) = if IMM8 > 16 {
+        (_mm_set1_epi8(0), a)
+    } else {
+        (a, b)
+    };
+    const fn mask(shift: u32, i: u32) -> u32 {
+        if shift > 32 {
+            // Unused, but needs to be a valid index.
+            i
+        } else if shift > 16 {
+            shift - 16 + i
+        } else {
+            shift + i
+        }
+    }
+    let r: i8x16 = simd_shuffle16!(
+        b.as_i8x16(),
+        a.as_i8x16(),
+        <const IMM8: i32> [
+            mask(IMM8 as u32, 0),
+            mask(IMM8 as u32, 1),
+            mask(IMM8 as u32, 2),
+            mask(IMM8 as u32, 3),
+            mask(IMM8 as u32, 4),
+            mask(IMM8 as u32, 5),
+            mask(IMM8 as u32, 6),
+            mask(IMM8 as u32, 7),
+            mask(IMM8 as u32, 8),
+            mask(IMM8 as u32, 9),
+            mask(IMM8 as u32, 10),
+            mask(IMM8 as u32, 11),
+            mask(IMM8 as u32, 12),
+            mask(IMM8 as u32, 13),
+            mask(IMM8 as u32, 14),
+            mask(IMM8 as u32, 15),
+        ],
+    );
+    transmute(r)
+}
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of `[8 x i16]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phaddw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
+    transmute(phaddw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of `[8 x i16]`. Positive sums greater than `0x7FFF` are
+/// saturated to `0x7FFF`. Negative sums less than `-0x8000` are saturated to
+/// `-0x8000`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phaddsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i {
+    transmute(phaddsw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of `[4 x i32]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_epi32)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phaddd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i {
+    transmute(phaddd128(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of `[8 x i16]`.
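+///
+/// Picturing `a` and `b` as `[i16; 8]`, this is logically equivalent to the
+/// following plain-Rust sketch (a model of the semantics; the wrapping
+/// arithmetic matches the instruction's two's-complement behavior):
+///
+/// ```
+/// fn mm_hsub_epi16(a: [i16; 8], b: [i16; 8]) -> [i16; 8] {
+///     let mut r = [0i16; 8];
+///     for i in 0..4 {
+///         // Each lane is the even-indexed element minus its odd neighbor.
+///         r[i] = a[2 * i].wrapping_sub(a[2 * i + 1]);
+///         r[i + 4] = b[2 * i].wrapping_sub(b[2 * i + 1]);
+///     }
+///     r
+/// }
+/// ```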
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phsubw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
+    transmute(phsubw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of `[8 x i16]`. Positive differences greater than
+/// `0x7FFF` are saturated to `0x7FFF`. Negative differences less than
+/// `-0x8000` are saturated to `-0x8000`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phsubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    transmute(phsubsw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of `[4 x i32]`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(phsubd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i {
+    transmute(phsubd128(a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiplies corresponding pairs of packed 8-bit unsigned integer
+/// values contained in the first source operand and packed 8-bit signed
+/// integer values contained in the second source operand, adds pairs of
+/// contiguous products with signed saturation, and writes the 16-bit sums to
+/// the corresponding bits in the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pmaddubsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    transmute(pmaddubsw128(a.as_u8x16(), b.as_i8x16()))
+}
+
+/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
+/// products to the 18 most significant bits by right-shifting, rounds the
+/// truncated values by adding 1, and writes bits `[16:1]` to the destination.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(pmulhrsw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    transmute(pmulhrsw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Negates packed 8-bit integers in `a` when the corresponding signed 8-bit
+/// integer in `b` is negative, and returns the results.
+/// Elements in the result are zeroed out when the corresponding element in
+/// `b` is zero.
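+///
+/// Picturing `a` and `b` as `[i8; 16]`, this is logically equivalent to the
+/// following plain-Rust sketch (a model of the semantics; `wrapping_neg`
+/// mirrors the instruction's behavior for `i8::MIN`):
+///
+/// ```
+/// fn mm_sign_epi8(a: [i8; 16], b: [i8; 16]) -> [i8; 16] {
+///     let mut r = [0i8; 16];
+///     for i in 0..16 {
+///         r[i] = if b[i] < 0 {
+///             a[i].wrapping_neg()
+///         } else if b[i] == 0 {
+///             0
+///         } else {
+///             a[i]
+///         };
+///     }
+///     r
+/// }
+/// ```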
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi8)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(psignb))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i {
+    transmute(psignb128(a.as_i8x16(), b.as_i8x16()))
+}
+
+/// Negates packed 16-bit integers in `a` when the corresponding signed 16-bit
+/// integer in `b` is negative, and returns the results.
+/// Elements in the result are zeroed out when the corresponding element in
+/// `b` is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi16)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(psignw))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i {
+    transmute(psignw128(a.as_i16x8(), b.as_i16x8()))
+}
+
+/// Negates packed 32-bit integers in `a` when the corresponding signed 32-bit
+/// integer in `b` is negative, and returns the results.
+/// Elements in the result are zeroed out when the corresponding element in
+/// `b` is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi32)
+#[inline]
+#[target_feature(enable = "ssse3")]
+#[cfg_attr(test, assert_instr(psignd))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i {
+    transmute(psignd128(a.as_i32x4(), b.as_i32x4()))
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.ssse3.pabs.b.128"]
+    fn pabsb128(a: i8x16) -> u8x16;
+
+    #[link_name = "llvm.x86.ssse3.pabs.w.128"]
+    fn pabsw128(a: i16x8) -> u16x8;
+
+    #[link_name = "llvm.x86.ssse3.pabs.d.128"]
+    fn pabsd128(a: i32x4) -> u32x4;
+
+    #[link_name = "llvm.x86.ssse3.pshuf.b.128"]
+    fn pshufb128(a: u8x16, b: u8x16) -> u8x16;
+
+    #[link_name = "llvm.x86.ssse3.phadd.w.128"]
+    fn phaddw128(a: i16x8, b: i16x8) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.phadd.sw.128"]
+    fn phaddsw128(a: i16x8, b: i16x8) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.phadd.d.128"]
+    fn phaddd128(a: i32x4, b: i32x4) -> i32x4;
+
+    #[link_name = "llvm.x86.ssse3.phsub.w.128"]
+    fn phsubw128(a: i16x8, b: i16x8) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.phsub.sw.128"]
+    fn phsubsw128(a: i16x8, b: i16x8) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.phsub.d.128"]
+    fn phsubd128(a: i32x4, b: i32x4) -> i32x4;
+
+    #[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"]
+    fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.pmul.hr.sw.128"]
+    fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.psign.b.128"]
+    fn psignb128(a: i8x16, b: i8x16) -> i8x16;
+
+    #[link_name = "llvm.x86.ssse3.psign.w.128"]
+    fn psignw128(a: i16x8, b: i16x8) -> i16x8;
+
+    #[link_name = "llvm.x86.ssse3.psign.d.128"]
+    fn psignd128(a: i32x4, b: i32x4) -> i32x4;
+}
+
+#[cfg(test)]
+mod tests {
+    use stdarch_test::simd_test;
+
+    use crate::core_arch::x86::*;
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_abs_epi8() {
+        let r = _mm_abs_epi8(_mm_set1_epi8(-5));
+        assert_eq_m128i(r, _mm_set1_epi8(5));
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_abs_epi16() {
+        let r = _mm_abs_epi16(_mm_set1_epi16(-5));
+        assert_eq_m128i(r, _mm_set1_epi16(5));
+    }
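+
+    // A sketch of the `i8::MIN` edge case (values chosen here, not taken
+    // from the original suite): +128 is not representable in i8, so `pabsb`
+    // leaves the bit pattern 0x80 unchanged.
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_abs_epi8_min() {
+        let r = _mm_abs_epi8(_mm_set1_epi8(i8::MIN));
+        assert_eq_m128i(r, _mm_set1_epi8(i8::MIN));
+    }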
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_abs_epi32() {
+        let r = _mm_abs_epi32(_mm_set1_epi32(-5));
+        assert_eq_m128i(r, _mm_set1_epi32(5));
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_shuffle_epi8() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+        );
+        #[rustfmt::skip]
+        let b = _mm_setr_epi8(
+            4, 128_u8 as i8, 4, 3,
+            24, 12, 6, 19,
+            12, 5, 5, 10,
+            4, 1, 8, 0,
+        );
+        let expected = _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
+        let r = _mm_shuffle_epi8(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_alignr_epi8() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+        );
+        #[rustfmt::skip]
+        let b = _mm_setr_epi8(
+            4, 63, 4, 3,
+            24, 12, 6, 19,
+            12, 5, 5, 10,
+            4, 1, 8, 0,
+        );
+        let r = _mm_alignr_epi8::<33>(a, b);
+        assert_eq_m128i(r, _mm_set1_epi8(0));
+
+        let r = _mm_alignr_epi8::<17>(a, b);
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            2, 3, 4, 5, 6, 7, 8, 9,
+            10, 11, 12, 13, 14, 15, 16, 0,
+        );
+        assert_eq_m128i(r, expected);
+
+        let r = _mm_alignr_epi8::<16>(a, b);
+        assert_eq_m128i(r, a);
+
+        let r = _mm_alignr_epi8::<15>(a, b);
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15,
+        );
+        assert_eq_m128i(r, expected);
+
+        let r = _mm_alignr_epi8::<0>(a, b);
+        assert_eq_m128i(r, b);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_hadd_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
+        let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 36, 25);
+        let r = _mm_hadd_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_hadds_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, 1, -32768, -1);
+        let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 32767, -32768);
+        let r = _mm_hadds_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_hadd_epi32() {
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        let b = _mm_setr_epi32(4, 128, 4, 3);
+        let expected = _mm_setr_epi32(3, 7, 132, 7);
+        let r = _mm_hadd_epi32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_hsub_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
+        let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 12, -13);
+        let r = _mm_hsub_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_hsubs_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
+        let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 32767, -32768);
+        let r = _mm_hsubs_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_hsub_epi32() {
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        let b = _mm_setr_epi32(4, 128, 4, 3);
+        let expected = _mm_setr_epi32(-1, -1, -124, 1);
+        let r = _mm_hsub_epi32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_maddubs_epi16() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+        );
+        #[rustfmt::skip]
+        let b = _mm_setr_epi8(
+            4, 63, 4, 3,
+            24, 12, 6, 19,
+            12, 5, 5, 10,
+            4, 1, 8, 0,
+        );
+        let expected = _mm_setr_epi16(130, 24, 192, 194, 158, 175, 66, 120);
+        let r = _mm_maddubs_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
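+
+    // A sketch of `pmaddubsw` saturation (values chosen here, not taken from
+    // the original suite): each pair of products is 255 * 127 + 255 * 127 =
+    // 64770, which saturates to `i16::MAX`. `-1` reinterprets as the
+    // unsigned byte 255.
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_maddubs_epi16_saturation() {
+        let r = _mm_maddubs_epi16(_mm_set1_epi8(-1), _mm_set1_epi8(127));
+        assert_eq_m128i(r, _mm_set1_epi16(i16::MAX));
+    }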
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_mulhrs_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
+        let expected = _mm_setr_epi16(0, 0, 0, 0, 5, 0, -7, 0);
+        let r = _mm_mulhrs_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_sign_epi8() {
+        #[rustfmt::skip]
+        let a = _mm_setr_epi8(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, -14, -15, 16,
+        );
+        #[rustfmt::skip]
+        let b = _mm_setr_epi8(
+            4, 63, -4, 3, 24, 12, -6, -19,
+            12, 5, -5, 10, 4, 1, -8, 0,
+        );
+        #[rustfmt::skip]
+        let expected = _mm_setr_epi8(
+            1, 2, -3, 4, 5, 6, -7, -8,
+            9, 10, -11, 12, 13, -14, 15, 0,
+        );
+        let r = _mm_sign_epi8(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_sign_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, -5, -6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 0, 3, 1, -1, -2, 1);
+        let expected = _mm_setr_epi16(1, 2, 0, 4, -5, 6, -7, 8);
+        let r = _mm_sign_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+
+    #[simd_test(enable = "ssse3")]
+    unsafe fn test_mm_sign_epi32() {
+        let a = _mm_setr_epi32(-1, 2, 3, 4);
+        let b = _mm_setr_epi32(1, -1, 1, 0);
+        let expected = _mm_setr_epi32(-1, -2, 3, 0);
+        let r = _mm_sign_epi32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+}
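The intrinsics added above are `unsafe` and compile-time gated on the `ssse3`
target feature, so callers typically pair them with runtime feature
detection. A minimal sketch of such usage (the `byte_reverse` helper is
hypothetical, not part of this commit):

```
use std::arch::x86_64::*;

/// Reverses the 16 bytes of `a` with `_mm_shuffle_epi8`.
fn byte_reverse(a: __m128i) -> Option<__m128i> {
    if is_x86_feature_detected!("ssse3") {
        // SAFETY: the required target feature was just verified at runtime.
        unsafe {
            // Index i of the control vector selects byte 15 - i of `a`.
            let idx = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
            Some(_mm_shuffle_epi8(a, idx))
        }
    } else {
        None
    }
}
```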