use crate::soft::{x2, x4};
use crate::types::*;
use crate::vec128_storage;
use crate::x86_64::Avx2Machine;
use crate::x86_64::SseMachine as Machine86;
use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
use core::arch::x86_64::*;
use core::marker::PhantomData;
use core::ops::{
    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
};

// `S3`, `S4` and `NI` are compile-time capability flags (`YesS3`/`NoS3` for
// SSSE3, `YesS4`/`NoS4` for SSE4.1, and the AES-NI marker) that select which
// instruction sequences the implementations below are allowed to emit.

macro_rules! impl_binop {
    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn $fn(self, rhs: Self) -> Self::Output {
                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
            }
        }
    };
}

macro_rules! impl_binop_assign {
    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
        where
            $vec<S3, S4, NI>: Copy,
        {
            #[inline(always)]
            fn $fn_assign(&mut self, rhs: Self) {
                *self = self.$fn(rhs);
            }
        }
    };
}

macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone)]
        pub struct $vec<S3, S4, NI> {
            x: __m128i,
            s3: PhantomData<S3>,
            s4: PhantomData<S4>,
            ni: PhantomData<NI>,
        }

        impl<S3, S4, NI> Store for $vec<S3, S4, NI> {
            #[inline(always)]
            unsafe fn unpack(x: vec128_storage) -> Self {
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            #[inline(always)]
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            #[inline(always)]
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            #[inline(always)]
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            #[inline(always)]
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            #[inline(always)]
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            #[inline(always)]
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            #[inline(always)]
            fn default() -> Self {
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn not(self) -> Self::Output {
                unsafe {
                    // XOR with an all-ones register is bitwise NOT.
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3, S4, NI> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3, S4, NI> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self {
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}

macro_rules! impl_bitops32 {
    ($vec:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops64 {
    ($vec:ident) => {
        impl_bitops32!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops128 {
    ($vec:ident) => {
        impl_bitops64!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord128
        {
        }
    };
}

macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
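// The two rotation strategies below compute the same per-word rotate:
// `rotr_32_s3!` handles rotates by a multiple of 8 with a single SSSE3 byte
// shuffle, while `rotr_32!` is the plain-SSE2 shift-and-OR fallback, valid
// for any amount 0 < $i < 32. A scalar model of the fallback, for reference
// only (this helper is illustrative and not part of the crate's API):
#[cfg(test)]
#[allow(dead_code)]
fn rotr_32_scalar_model(x: u32, i: u32) -> u32 {
    // Mirrors `_mm_srli_epi32(x, i) | _mm_slli_epi32(x, 32 - i)` per lane.
    debug_assert!(0 < i && i < 32);
    (x >> i) | (x << (32 - i))
}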
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f0e0d_080b0a09,
        0x04070605_00030201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c0f0e_09080b0a,
        0x05040706_01000302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d0c0f_0a09080b,
        0x06050407_02010003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        // Rotating a 32-bit word by 16 is exactly a swap of its 16-bit halves.
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}

macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64!(rotate_each_word_right8, 8);
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        // Rotate the four 16-bit fields of each 64-bit word down by one;
        // a pairwise swap (`swap16_s2`) would be a 32-bit rotate instead.
        Self::new(unsafe {
            _mm_shufflehi_epi16(_mm_shufflelo_epi16(self.x, 0b0011_1001), 0b0011_1001)
        })
    }
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64!(rotate_each_word_right24, 24);
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
}
macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_si128(self.x, $i as i32),
                    _mm_slli_si128(self.x, 128 - $i as i32),
                )
            })
        }
    };
}
// TODO: completely unoptimized; note also that `_mm_srli_si128` and
// `_mm_slli_si128` shift by *bytes*, so these rotates are not bit-accurate.
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}

def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);

impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(_xs: [u128; 1]) -> Self {
        unimplemented!()
    }
}
impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
{
    #[inline(always)]
    fn to_lanes(self) -> [u64; 4] {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        [a[0], a[1], b[0], b[1]]
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 4]) -> Self {
        let (a, b) = (
            u64x2_sse2::from_lanes([xs[0], xs[1]]),
            u64x2_sse2::from_lanes([xs[2], xs[3]]),
        );
        x2::new([a, b])
    }
}
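// Both lane paths above agree with a simple scalar packing model: a u32x4 is
// the four 32-bit halves of two u64 values, low lane first. The `YesS4`
// accessors compile down to `pextrq`/`pinsrq`, while the `NoS4` versions
// synthesize the same moves from SSE2 shuffles, shifts and ORs. The helper
// below is an illustrative sketch only, not part of the crate's API:
#[cfg(test)]
#[allow(dead_code)]
fn pack_u32_pair_model(lo: u32, hi: u32) -> u64 {
    u64::from(lo) | (u64::from(hi) << 32)
}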
macro_rules! impl_into {
    ($from:ident, $to:ident) => {
        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
            #[inline(always)]
            fn from(x: $from<S3, S4, NI>) -> Self {
                $to::new(x.x)
            }
        }
    };
}

impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);

impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);

impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}

impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}

impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}

impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            // `_mm_insert_epi32` needs a constant index, hence the match.
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    // Clear lane 0, then OR in the new value.
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    // Rotate the target lane into position 0, insert, rotate back.
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}

impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    #[inline(always)]
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    #[inline(always)]
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}

impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}
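// Shuffle naming convention: the k-th digit of the name is the destination
// lane of source lane k, so `shuffle2301` maps [x0, x1, x2, x3] to
// [x2, x3, x0, x1] and `shuffle1230` moves every lane up by one. A scalar
// model of the first case (illustrative only, not part of the crate):
#[cfg(test)]
#[allow(dead_code)]
fn shuffle2301_model(x: [u32; 4]) -> [u32; 4] {
    [x[2], x[3], x[0], x[1]]
}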
impl<S4: Copy, NI: Copy> Words4 for u64x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Swapping the two 128-bit halves swaps word pairs (0,1) and (2,3).
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
impl<S4: Copy, NI: Copy> Words4 for u64x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // Without SSSE3's `palignr`, recombine the qword halves with
        // byte shifts and ORs.
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}

impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}

impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}

impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}

// Byte-swap each 32-bit word without SSSE3: widen the bytes to 16-bit lanes,
// reverse each group of four with 16-bit shuffles, then pack back down.
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
    }
}
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}

impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        // Swap the 32-bit halves of each qword, then byte-swap each of them.
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}

impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // Reverse all 16 bytes of the 128-bit word.
            let k = _mm_set_epi64x(0x0001_0203_0405_0607, 0x0809_0a0b_0c0d_0e0f);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe { unimplemented!() })
    }
}

macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}

#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;

#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
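// Wider logical vectors are built by pairing 128-bit registers with the
// generic `x2`/`x4` wrappers from `crate::soft`. The `G0`/`G1` markers only
// disambiguate otherwise-identical `x2` instantiations (e.g. `u64x2x2_sse2`
// vs `u64x4_sse2`, both pairs of `u64x2_sse2`) so that each alias can carry
// its own trait implementations.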
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}

impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}

impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        match i {
            0 => self.0[0].extract(0),
            1 => self.0[0].extract(1),
            2 => self.0[1].extract(0),
            3 => self.0[1].extract(1),
            _ => panic!(),
        }
    }
    #[inline(always)]
    fn insert(mut self, w: u64, i: u32) -> Self {
        match i {
            0 => self.0[0] = self.0[0].insert(w, 0),
            1 => self.0[0] = self.0[0].insert(w, 1),
            2 => self.0[1] = self.0[1].insert(w, 0),
            3 => self.0[1] = self.0[1].insert(w, 1),
            _ => panic!(),
        };
        self
    }
}

impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}

impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}
macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            #[inline(always)]
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            #[inline(always)]
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);

///// Debugging

use core::fmt::{Debug, Formatter, Result};

impl<W: PartialEq, G> PartialEq for x2<W, G> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
    }
}

#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}

#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}

impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u32; 4]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
    }
}

#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod test {
    use super::*;
    use crate::x86_64::{SSE2, SSE41, SSSE3};
    use crate::Machine;

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap32_s2_vs_s3() {
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.bswap()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
        assert_eq!(x_s2, s2.vec(ys));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap64_s2_vs_s3() {
        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            x_s2.bswap()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }
    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle32_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        // `shuffle1230` is the inverse of `shuffle3012`, so this round-trips.
        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle64_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    #[test]
    fn test_lanes_u32x4() {
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }
        {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }
        {
            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u64x2() {
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }
        {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }
        {
            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    fn test_vec4_u32x4_s2() {
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u32x4_s4() {
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }
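    // Byte-swapping twice must be the identity on any machine level; this
    // minimal SSE2-only check (an illustrative addition modeled on the tests
    // above, not an upstream test) pins that down.
    #[test]
    fn test_bswap_involution_s2() {
        let xs = [0x0102_0304, 0x0506_0708, 0x090a_0b0c, 0x0d0e_0f10];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.bswap().bswap(), x_s2);
    }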
    #[test]
    fn test_vec2_u64x2_s2() {
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u64x2_s4() {
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
}

pub mod avx2 {
    #![allow(non_camel_case_types)]
    use crate::soft::x4;
    use crate::types::*;
    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
    use core::arch::x86_64::*;
    use core::marker::PhantomData;
    use core::ops::*;

    // Four u32x4 lanes packed into two 256-bit registers.
    #[derive(Copy, Clone)]
    pub struct u32x4x4_avx2<NI> {
        x: [__m256i; 2],
        ni: PhantomData<NI>,
    }

    impl<NI> u32x4x4_avx2<NI> {
        #[inline(always)]
        fn new(x: [__m256i; 2]) -> Self {
            Self { x, ni: PhantomData }
        }
    }

    impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> Store for u32x4x4_avx2<NI> {
        #[inline(always)]
        unsafe fn unpack(p: vec512_storage) -> Self {
            Self::new([p.avx[0].avx, p.avx[1].avx])
        }
    }

    impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
            unsafe {
                [
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                ]
            }
        }
        #[inline(always)]
        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
            Self::new(unsafe {
                [
                    _mm256_setr_m128i(x[0].x, x[1].x),
                    _mm256_setr_m128i(x[2].x, x[3].x),
                ]
            })
        }
    }

    impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
            unsafe {
                match i {
                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
                    2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
                    3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                    _ => panic!(),
                }
            }
        }
        #[inline(always)]
        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
            Self::new(unsafe {
                match i {
                    0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
                    1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
                    2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
                    3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
                    _ => panic!(),
                }
            })
        }
    }

    impl<NI: Copy> LaneWords4 for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn shuffle_lane_words1230(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
                    _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
                ]
            })
        }
        #[inline(always)]
        fn shuffle_lane_words2301(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
                    _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
                ]
            })
        }
        #[inline(always)]
        fn shuffle_lane_words3012(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
                    _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
                ]
            })
        }
    }

    impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}

    macro_rules! shuf_lane_bytes {
        ($name:ident, $k0:expr, $k1:expr) => {
            #[inline(always)]
            fn $name(self) -> Self {
                Self::new(unsafe {
                    [
                        _mm256_shuffle_epi8(self.x[0], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
                        _mm256_shuffle_epi8(self.x[1], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
                    ]
                })
            }
        };
    }
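    // The AVX2 rotates below mirror the 128-bit SSE2 versions above, applied
    // to both 256-bit registers of the x4 vector; `shuf_lane_bytes!` simply
    // duplicates the 128-bit shuffle key into each lane of the 256-bit
    // register (`_mm256_set_epi64x($k0, $k1, $k0, $k1)`), since `vpshufb`
    // permutes within 128-bit lanes.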
    macro_rules! rotr_32 {
        ($name:ident, $i:expr) => {
            #[inline(always)]
            fn $name(self) -> Self {
                Self::new(unsafe {
                    [
                        _mm256_or_si256(
                            _mm256_srli_epi32(self.x[0], $i as i32),
                            _mm256_slli_epi32(self.x[0], 32 - $i as i32),
                        ),
                        _mm256_or_si256(
                            _mm256_srli_epi32(self.x[1], $i as i32),
                            _mm256_slli_epi32(self.x[1], 32 - $i as i32),
                        ),
                    ]
                })
            }
        };
    }
    impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
        rotr_32!(rotate_each_word_right7, 7);
        shuf_lane_bytes!(
            rotate_each_word_right8,
            0x0c0f0e0d_080b0a09,
            0x04070605_00030201
        );
        rotr_32!(rotate_each_word_right11, 11);
        rotr_32!(rotate_each_word_right12, 12);
        shuf_lane_bytes!(
            rotate_each_word_right16,
            0x0d0c0f0e_09080b0a,
            0x05040706_01000302
        );
        rotr_32!(rotate_each_word_right20, 20);
        shuf_lane_bytes!(
            rotate_each_word_right24,
            0x0e0d0c0f_0a09080b,
            0x06050407_02010003
        );
        rotr_32!(rotate_each_word_right25, 25);
    }

    impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}

    impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
        #[inline(always)]
        fn from(x: u32x4x4_avx2<NI>) -> Self {
            Self {
                avx: [
                    vec256_storage { avx: x.x[0] },
                    vec256_storage { avx: x.x[1] },
                ],
            }
        }
    }

    macro_rules! impl_assign {
        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
            impl<NI> $Assign for $vec<NI>
            where
                NI: Copy,
            {
                #[inline(always)]
                fn $assign_fn(&mut self, rhs: Self) {
                    *self = self.$bin_fn(rhs);
                }
            }
        };
    }
    impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
    impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
    impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
    impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);

    macro_rules! impl_bitop_x2 {
        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
            impl<NI> $Op for $vec<NI> {
                type Output = Self;
                #[inline(always)]
                fn $op_fn(self, rhs: Self) -> Self::Output {
                    Self::new(unsafe {
                        [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
                    })
                }
            }
        };
    }
    impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
    impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
    impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
    impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
    impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);

    impl<NI> Not for u32x4x4_avx2<NI> {
        type Output = Self;
        #[inline(always)]
        fn not(self) -> Self::Output {
            unsafe {
                // XOR with an all-ones register is bitwise NOT.
                let f = _mm256_set1_epi8(-1);
                Self::new([f, f]) ^ self
            }
        }
    }

    impl<NI: Copy> BSwap for u32x4x4_avx2<NI> {
        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
    }

    impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
    where
        NI: Copy,
    {
        #[inline(always)]
        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
            Self::new(unsafe {
                [
                    _mm256_setr_m128i(x.0[0].x, x.0[1].x),
                    _mm256_setr_m128i(x.0[2].x, x.0[3].x),
                ]
            })
        }
    }
}