From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Wed, 17 Apr 2024 14:02:58 +0200
Subject: Adding upstream version 1.64.0+dfsg1.

Signed-off-by: Daniel Baumann
---
 vendor/ppv-lite86/src/x86_64/mod.rs  |  433 +++
 vendor/ppv-lite86/src/x86_64/sse2.rs | 1632 ++++++++++++++++++++++++++++++++++
 2 files changed, 2065 insertions(+)
 create mode 100644 vendor/ppv-lite86/src/x86_64/mod.rs
 create mode 100644 vendor/ppv-lite86/src/x86_64/sse2.rs

diff --git a/vendor/ppv-lite86/src/x86_64/mod.rs b/vendor/ppv-lite86/src/x86_64/mod.rs
new file mode 100644
index 000000000..ecf184f36
--- /dev/null
+++ b/vendor/ppv-lite86/src/x86_64/mod.rs
@@ -0,0 +1,433 @@
+// crate minimums: sse2, x86_64
+
+use core::arch::x86_64::{__m128i, __m256i};
+use crate::types::*;
+
+mod sse2;
+
+#[derive(Copy, Clone)]
+pub struct YesS3;
+#[derive(Copy, Clone)]
+pub struct NoS3;
+
+#[derive(Copy, Clone)]
+pub struct YesS4;
+#[derive(Copy, Clone)]
+pub struct NoS4;
+
+#[derive(Copy, Clone)]
+pub struct YesA1;
+#[derive(Copy, Clone)]
+pub struct NoA1;
+
+#[derive(Copy, Clone)]
+pub struct YesA2;
+#[derive(Copy, Clone)]
+pub struct NoA2;
+
+#[derive(Copy, Clone)]
+pub struct YesNI;
+#[derive(Copy, Clone)]
+pub struct NoNI;
+
+use core::marker::PhantomData;
+
+#[derive(Copy, Clone)]
+pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
+impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
+where
+    sse2::u128x1_sse2<S3, S4, NI>: Swap64,
+    sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
+    sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
+    sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
+    sse2::u128x1_sse2<S3, S4, NI>: BSwap,
+    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
+    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
+    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
+    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
+    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
+{
+    type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
+    type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
+    type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;
+
+    type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
+    type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
+    type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
+    type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;
+
+    type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
+    type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
+    type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;
+
+    #[inline(always)]
+    unsafe fn instance() -> Self {
+        SseMachine(PhantomData)
+    }
+}
+
+#[derive(Copy, Clone)]
+pub struct Avx2Machine<NI>(PhantomData<NI>);
+impl<NI: Copy> Machine for Avx2Machine<NI>
+where
+    sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
+    sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
+    sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
+    sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
+{
+    type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
+    type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
+    type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
+
+    type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
+    type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
+    type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
+    type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
+
+    type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
+    type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
+    type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;
+
+    #[inline(always)]
+    unsafe fn instance() -> Self {
+        Avx2Machine(PhantomData)
+    }
+}
+
+pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
+pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
+pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
+/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
+/// to avoid expensive SSE/VEX conflicts.
+pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
+pub type AVX2 = Avx2Machine<NoNI>;
+
+/// Generic wrapper for unparameterized storage of any of the possible impls.
+/// Converting into and out of this type should be essentially free, although it may be more
+/// aligned than a particular impl requires.
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone)]
+pub union vec128_storage {
+    u32x4: [u32; 4],
+    u64x2: [u64; 2],
+    u128x1: [u128; 1],
+    sse2: __m128i,
+}
+impl Store<vec128_storage> for vec128_storage {
+    #[inline(always)]
+    unsafe fn unpack(p: vec128_storage) -> Self {
+        p
+    }
+}
+impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
+    #[inline(always)]
+    fn into(self) -> &'a [u32; 4] {
+        unsafe { &self.u32x4 }
+    }
+}
+impl Into<vec128_storage> for [u32; 4] {
+    #[inline(always)]
+    fn into(self) -> vec128_storage {
+        vec128_storage { u32x4: self }
+    }
+}
+impl Default for vec128_storage {
+    #[inline(always)]
+    fn default() -> Self {
+        vec128_storage { u128x1: [0] }
+    }
+}
+impl Eq for vec128_storage {}
+impl PartialEq for vec128_storage {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.u128x1 == rhs.u128x1 }
+    }
+}
+
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone)]
+pub union vec256_storage {
+    u32x8: [u32; 8],
+    u64x4: [u64; 4],
+    u128x2: [u128; 2],
+    sse2: [vec128_storage; 2],
+    avx: __m256i,
+}
+impl Into<vec256_storage> for [u64; 4] {
+    #[inline(always)]
+    fn into(self) -> vec256_storage {
+        vec256_storage { u64x4: self }
+    }
+}
+impl Default for vec256_storage {
+    #[inline(always)]
+    fn default() -> Self {
+        vec256_storage { u128x2: [0, 0] }
+    }
+}
+impl vec256_storage {
+    pub fn new128(xs: [vec128_storage; 2]) -> Self {
+        Self { sse2: xs }
+    }
+    pub fn split128(self) -> [vec128_storage; 2] {
+        unsafe { self.sse2 }
+    }
+}
+impl Eq for vec256_storage {}
+impl PartialEq for vec256_storage {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.sse2 == rhs.sse2 }
+    }
+}
+
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone)]
+pub union vec512_storage {
+    u32x16: [u32; 16],
+    u64x8: [u64; 8],
+    u128x4: [u128; 4],
+    sse2: [vec128_storage; 4],
+    avx: [vec256_storage; 2],
+}
+impl Default for vec512_storage {
+    #[inline(always)]
+    fn default() -> Self {
+        vec512_storage {
+            u128x4: [0, 0, 0, 0],
+        }
+    }
+}
+impl vec512_storage {
+    pub fn new128(xs: [vec128_storage; 4]) -> Self {
+        Self { sse2: xs }
+    }
+    pub fn split128(self) -> [vec128_storage; 4] {
+        unsafe { self.sse2 }
+    }
+}
+impl Eq for vec512_storage {}
+impl PartialEq for vec512_storage {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.avx == rhs.avx }
+    }
+}
+
+macro_rules! impl_into {
+    ($storage:ident, $array:ty, $name:ident) => {
+        impl Into<$array> for $storage {
+            #[inline(always)]
+            fn into(self) -> $array {
+                unsafe { self.$name }
+            }
+        }
+    };
+}
+impl_into!(vec128_storage, [u32; 4], u32x4);
+impl_into!(vec128_storage, [u64; 2], u64x2);
+impl_into!(vec128_storage, [u128; 1], u128x1);
+impl_into!(vec256_storage, [u32; 8], u32x8);
+impl_into!(vec256_storage, [u64; 4], u64x4);
+impl_into!(vec256_storage, [u128; 2], u128x2);
+impl_into!(vec512_storage, [u32; 16], u32x16);
+impl_into!(vec512_storage, [u64; 8], u64x8);
+impl_into!(vec512_storage, [u128; 4], u128x4);
+
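The impl_into! conversions above make the storage unions round-trip freely with plain
word arrays. A minimal sketch of that use, assuming ppv_lite86 re-exports vec128_storage
at its crate root:

    use ppv_lite86::vec128_storage;

    fn main() {
        let words: [u32; 4] = [1, 2, 3, 4];
        // [u32; 4] -> vec128_storage, via the Into impl above
        let v: vec128_storage = words.into();
        // vec128_storage -> [u32; 4], via the impl_into!-generated conversion
        let back: [u32; 4] = v.into();
        assert!(words == back);
        // default() is the all-zero storage
        let zero: [u32; 4] = vec128_storage::default().into();
        assert!(zero == [0u32; 4]);
    }
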
+/// Generate the full set of optimized implementations to take advantage of the most important
+/// hardware feature sets.
+///
+/// This dispatcher is suitable for maximizing throughput.
+#[macro_export]
+macro_rules! dispatch {
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+        #[cfg(feature = "std")]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            #[inline(always)]
+            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            use std::arch::x86_64::*;
+            #[target_feature(enable = "avx2")]
+            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
+                let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
+                _mm256_zeroupper();
+                ret
+            }
+            #[target_feature(enable = "avx")]
+            #[target_feature(enable = "sse4.1")]
+            #[target_feature(enable = "ssse3")]
+            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
+                let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
+                _mm256_zeroupper();
+                ret
+            }
+            #[target_feature(enable = "sse4.1")]
+            #[target_feature(enable = "ssse3")]
+            unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+            }
+            #[target_feature(enable = "ssse3")]
+            unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+            }
+            #[target_feature(enable = "sse2")]
+            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+            }
+            unsafe {
+                if is_x86_feature_detected!("avx2") {
+                    impl_avx2($($arg),*)
+                } else if is_x86_feature_detected!("avx") {
+                    impl_avx($($arg),*)
+                } else if is_x86_feature_detected!("sse4.1") {
+                    impl_sse41($($arg),*)
+                } else if is_x86_feature_detected!("ssse3") {
+                    impl_ssse3($($arg),*)
+                } else if is_x86_feature_detected!("sse2") {
+                    impl_sse2($($arg),*)
+                } else {
+                    unimplemented!()
+                }
+            }
+        }
+        #[cfg(not(feature = "std"))]
+        #[inline(always)]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            unsafe {
+                if cfg!(target_feature = "avx2") {
+                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
+                } else if cfg!(target_feature = "avx") {
+                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+                } else if cfg!(target_feature = "sse4.1") {
+                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+                } else if cfg!(target_feature = "ssse3") {
+                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+                } else {
+                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+                }
+            }
+        }
+    };
+    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+        dispatch!($mach, $MTy, {
+            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+        });
+    }
+}
+
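As the matcher above shows, dispatch! takes the visibility wrapped in square brackets
and hands the body a Machine instance under the first identifier. A hypothetical
invocation (add_words is illustrative, not from this crate), assuming the crate's
std feature so runtime detection applies:

    use ppv_lite86::{Machine, MultiLane};

    ppv_lite86::dispatch!(m, M, {
        [pub] fn add_words(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
            // `m` is the most capable Machine the dispatcher detected.
            let av: <M as Machine>::u32x4 = m.vec(a);
            let bv: <M as Machine>::u32x4 = m.vec(b);
            (av + bv).to_lanes()
        }
    });
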
+/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
+/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
+///
+/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
+/// features (e.g. because they are done infrequently), so minimizing their contribution to code
+/// size is more important.
+#[macro_export]
+macro_rules! dispatch_light128 {
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+        #[cfg(feature = "std")]
+        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            #[inline(always)]
+            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            use std::arch::x86_64::*;
+            #[target_feature(enable = "avx")]
+            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+            }
+            #[target_feature(enable = "sse2")]
+            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+            }
+            unsafe {
+                if is_x86_feature_detected!("avx") {
+                    impl_avx($($arg),*)
+                } else if is_x86_feature_detected!("sse2") {
+                    impl_sse2($($arg),*)
+                } else {
+                    unimplemented!()
+                }
+            }
+        }
+        #[cfg(not(feature = "std"))]
+        #[inline(always)]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            unsafe {
+                if cfg!(target_feature = "avx2") {
+                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
+                } else if cfg!(target_feature = "avx") {
+                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+                } else if cfg!(target_feature = "sse4.1") {
+                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+                } else if cfg!(target_feature = "ssse3") {
+                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+                } else {
+                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+                }
+            }
+        }
+    };
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+        dispatch_light128!($mach, $MTy, {
+            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+        });
+    }
+}
+
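The light dispatchers compile one monomorphic body per feature level and pick one at
runtime. The same pattern in miniature, detached from the macros (sum is a hypothetical
stand-in), for std builds on x86_64:

    fn sum(xs: &[u32]) -> u32 {
        // Each body is compiled with its own target features enabled...
        #[target_feature(enable = "avx2")]
        unsafe fn sum_avx2(xs: &[u32]) -> u32 {
            xs.iter().fold(0u32, |a, &b| a.wrapping_add(b))
        }
        fn sum_sse2(xs: &[u32]) -> u32 {
            xs.iter().fold(0u32, |a, &b| a.wrapping_add(b))
        }
        // ...and CPUID picks the best one at runtime, as in the macros above.
        if is_x86_feature_detected!("avx2") {
            unsafe { sum_avx2(xs) }
        } else {
            sum_sse2(xs)
        }
    }
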
+/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
+/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
+///
+/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
+/// features (e.g. because they are done infrequently), so minimizing their contribution to code
+/// size is more important.
+#[macro_export]
+macro_rules! dispatch_light256 {
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+        #[cfg(feature = "std")]
+        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            #[inline(always)]
+            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            use std::arch::x86_64::*;
+            #[target_feature(enable = "avx")]
+            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+            }
+            #[target_feature(enable = "sse2")]
+            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+            }
+            unsafe {
+                if is_x86_feature_detected!("avx") {
+                    impl_avx($($arg),*)
+                } else if is_x86_feature_detected!("sse2") {
+                    impl_sse2($($arg),*)
+                } else {
+                    unimplemented!()
+                }
+            }
+        }
+        #[cfg(not(feature = "std"))]
+        #[inline(always)]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            unsafe {
+                if cfg!(target_feature = "avx2") {
+                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
+                } else if cfg!(target_feature = "avx") {
+                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+                } else if cfg!(target_feature = "sse4.1") {
+                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+                } else if cfg!(target_feature = "ssse3") {
+                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+                } else {
+                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+                }
+            }
+        }
+    };
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+        dispatch_light256!($mach, $MTy, {
+            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+        });
+    }
+}

diff --git a/vendor/ppv-lite86/src/x86_64/sse2.rs b/vendor/ppv-lite86/src/x86_64/sse2.rs
new file mode 100644
index 000000000..60e7681c3
--- /dev/null
+++ b/vendor/ppv-lite86/src/x86_64/sse2.rs
@@ -0,0 +1,1632 @@
+use crate::soft::{x2, x4};
+use crate::types::*;
+use crate::vec128_storage;
+use crate::x86_64::Avx2Machine;
+use crate::x86_64::SseMachine as Machine86;
+use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
+use core::arch::x86_64::*;
+use core::marker::PhantomData;
+use core::ops::{
+    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
+};
+
+macro_rules! impl_binop {
+    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
+        impl<S3: Copy, S4: Copy, NI: Copy> $trait for $vec<S3, S4, NI> {
+            type Output = Self;
+            #[inline(always)]
+            fn $fn(self, rhs: Self) -> Self::Output {
+                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
+            }
+        }
+    };
+}
+
+macro_rules! impl_binop_assign {
+    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
+        impl<S3: Copy, S4: Copy, NI: Copy> $trait for $vec<S3, S4, NI>
+        where
+            $vec<S3, S4, NI>: Copy,
+        {
+            #[inline(always)]
+            fn $fn_assign(&mut self, rhs: Self) {
+                *self = self.$fn(rhs);
+            }
+        }
+    };
+}
+
+macro_rules! def_vec {
+    ($vec:ident, $word:ident) => {
+        #[allow(non_camel_case_types)]
+        #[derive(Copy, Clone)]
+        pub struct $vec<S3, S4, NI> {
+            x: __m128i,
+            s3: PhantomData<S3>,
+            s4: PhantomData<S4>,
+            ni: PhantomData<NI>,
+        }
+
+        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
+            #[inline(always)]
+            unsafe fn unpack(x: vec128_storage) -> Self {
+                Self::new(x.sse2)
+            }
+        }
+        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
+            #[inline(always)]
+            fn from(x: $vec<S3, S4, NI>) -> Self {
+                vec128_storage { sse2: x.x }
+            }
+        }
+        impl<S3, S4, NI> $vec<S3, S4, NI> {
+            #[inline(always)]
+            fn new(x: __m128i) -> Self {
+                $vec {
+                    x,
+                    s3: PhantomData,
+                    s4: PhantomData,
+                    ni: PhantomData,
+                }
+            }
+        }
+
+        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
+        where
+            Self: BSwap,
+        {
+            #[inline(always)]
+            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+                assert_eq!(input.len(), 16);
+                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
+            }
+            #[inline(always)]
+            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
+                assert_eq!(input.len(), 16);
+                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
+            }
+            #[inline(always)]
+            fn write_le(self, out: &mut [u8]) {
+                assert_eq!(out.len(), 16);
+                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
+            }
+            #[inline(always)]
+            fn write_be(self, out: &mut [u8]) {
+                assert_eq!(out.len(), 16);
+                let x = self.bswap().x;
+                unsafe {
+                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
+                }
+            }
+        }
+
+        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
+            #[inline(always)]
+            fn default() -> Self {
+                Self::new(unsafe { _mm_setzero_si128() })
+            }
+        }
+
+        impl<S3: Copy, S4: Copy, NI: Copy> Not for $vec<S3, S4, NI> {
+            type Output = Self;
+            #[inline(always)]
+            fn not(self) -> Self::Output {
+                unsafe {
+                    let ff = _mm_set1_epi64x(-1i64);
+                    self ^ Self::new(ff)
+                }
+            }
+        }
+
+        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
+        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
+        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
+        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
+        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
+        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
+        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
+        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
+            type Output = Self;
+            #[inline(always)]
+            fn andnot(self, rhs: Self) -> Self {
+                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
+            }
+        }
+    };
+}
+
+macro_rules! impl_bitops32 {
+    ($vec:ident) => {
+        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
+            $vec<S3, S4, NI>: RotateEachWord32
+        {
+        }
+    };
+}
+
+macro_rules! impl_bitops64 {
+    ($vec:ident) => {
+        impl_bitops32!($vec);
+        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
+            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
+        {
+        }
+    };
+}
+
+macro_rules! impl_bitops128 {
+    ($vec:ident) => {
+        impl_bitops64!($vec);
+        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
+            $vec<S3, S4, NI>: RotateEachWord128
+        {
+        }
+    };
+}
+
+macro_rules! rotr_32_s3 {
+    ($name:ident, $k0:expr, $k1:expr) => {
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_shuffle_epi8(
+                    self.x,
+                    _mm_set_epi64x($k0, $k1),
+                )
+            })
+        }
+    };
+}
+macro_rules! rotr_32 {
+    ($name:ident, $i:expr) => {
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_or_si128(
+                    _mm_srli_epi32(self.x, $i as i32),
+                    _mm_slli_epi32(self.x, 32 - $i as i32),
+                )
+            })
+        }
+    };
+}
+impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
+    rotr_32!(rotate_each_word_right7, 7);
+    rotr_32_s3!(
+        rotate_each_word_right8,
+        0x0c0f0e0d_080b0a09,
+        0x04070605_00030201
+    );
+    rotr_32!(rotate_each_word_right11, 11);
+    rotr_32!(rotate_each_word_right12, 12);
+    rotr_32_s3!(
+        rotate_each_word_right16,
+        0x0d0c0f0e_09080b0a,
+        0x05040706_01000302
+    );
+    rotr_32!(rotate_each_word_right20, 20);
+    rotr_32_s3!(
+        rotate_each_word_right24,
+        0x0e0d0c0f_0a09080b,
+        0x06050407_02010003
+    );
+    rotr_32!(rotate_each_word_right25, 25);
+}
+impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
+    rotr_32!(rotate_each_word_right7, 7);
+    rotr_32!(rotate_each_word_right8, 8);
+    rotr_32!(rotate_each_word_right11, 11);
+    rotr_32!(rotate_each_word_right12, 12);
+    #[inline(always)]
+    fn rotate_each_word_right16(self) -> Self {
+        Self::new(swap16_s2(self.x))
+    }
+    rotr_32!(rotate_each_word_right20, 20);
+    rotr_32!(rotate_each_word_right24, 24);
+    rotr_32!(rotate_each_word_right25, 25);
+}
+
+macro_rules! rotr_64_s3 {
+    ($name:ident, $k0:expr, $k1:expr) => {
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_shuffle_epi8(
+                    self.x,
+                    _mm_set_epi64x($k0, $k1),
+                )
+            })
+        }
+    };
+}
+macro_rules! rotr_64 {
+    ($name:ident, $i:expr) => {
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_or_si128(
+                    _mm_srli_epi64(self.x, $i as i32),
+                    _mm_slli_epi64(self.x, 64 - $i as i32),
+                )
+            })
+        }
+    };
+}
+impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
+    rotr_64!(rotate_each_word_right7, 7);
+    rotr_64_s3!(
+        rotate_each_word_right8,
+        0x080f_0e0d_0c0b_0a09,
+        0x0007_0605_0403_0201
+    );
+    rotr_64!(rotate_each_word_right11, 11);
+    rotr_64!(rotate_each_word_right12, 12);
+    rotr_64_s3!(
+        rotate_each_word_right16,
+        0x0908_0f0e_0d0c_0b0a,
+        0x0100_0706_0504_0302
+    );
+    rotr_64!(rotate_each_word_right20, 20);
+    rotr_64_s3!(
+        rotate_each_word_right24,
+        0x0a09_080f_0e0d_0c0b,
+        0x0201_0007_0605_0403
+    );
+    rotr_64!(rotate_each_word_right25, 25);
+}
+impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
+    rotr_64!(rotate_each_word_right7, 7);
+    rotr_64!(rotate_each_word_right8, 8);
+    rotr_64!(rotate_each_word_right11, 11);
+    rotr_64!(rotate_each_word_right12, 12);
+    #[inline(always)]
+    fn rotate_each_word_right16(self) -> Self {
+        Self::new(swap16_s2(self.x))
+    }
+    rotr_64!(rotate_each_word_right20, 20);
+    rotr_64!(rotate_each_word_right24, 24);
+    rotr_64!(rotate_each_word_right25, 25);
+}
+impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn rotate_each_word_right32(self) -> Self {
+        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
+    }
+}
+
+macro_rules! rotr_128 {
+    ($name:ident, $i:expr) => {
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_or_si128(
+                    _mm_srli_si128(self.x, $i as i32),
+                    _mm_slli_si128(self.x, 128 - $i as i32),
+                )
+            })
+        }
+    };
+}
+// TODO: completely unoptimized
+impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
+    rotr_128!(rotate_each_word_right7, 7);
+    rotr_128!(rotate_each_word_right8, 8);
+    rotr_128!(rotate_each_word_right11, 11);
+    rotr_128!(rotate_each_word_right12, 12);
+    rotr_128!(rotate_each_word_right16, 16);
+    rotr_128!(rotate_each_word_right20, 20);
+    rotr_128!(rotate_each_word_right24, 24);
+    rotr_128!(rotate_each_word_right25, 25);
+}
+// TODO: completely unoptimized
+impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
+    rotr_128!(rotate_each_word_right32, 32);
+}
+impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}
+
+def_vec!(u32x4_sse2, u32);
+def_vec!(u64x2_sse2, u64);
+def_vec!(u128x1_sse2, u128);
+
+impl<S3: Copy, NI: Copy> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u32; 4] {
+        unsafe {
+            let x = _mm_cvtsi128_si64(self.x) as u64;
+            let y = _mm_extract_epi64(self.x, 1) as u64;
+            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
+        }
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u32; 4]) -> Self {
+        unsafe {
+            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
+            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
+            Self::new(x)
+        }
+    }
+}
+impl<S3: Copy, NI: Copy> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u32; 4] {
+        unsafe {
+            let x = _mm_cvtsi128_si64(self.x) as u64;
+            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
+            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
+        }
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u32; 4]) -> Self {
+        unsafe {
+            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
+            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
+            let x = _mm_cvtsi64_si128(x);
+            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
+            Self::new(_mm_or_si128(x, y))
+        }
+    }
+}
+impl<S3: Copy, NI: Copy> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u64; 2] {
+        unsafe {
+            [
+                _mm_cvtsi128_si64(self.x) as u64,
+                _mm_extract_epi64(self.x, 1) as u64,
+            ]
+        }
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u64; 2]) -> Self {
+        unsafe {
+            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
+            x = _mm_insert_epi64(x, xs[1] as i64, 1);
+            Self::new(x)
+        }
+    }
+}
+impl<S3: Copy, NI: Copy> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u64; 2] {
+        unsafe {
+            [
+                _mm_cvtsi128_si64(self.x) as u64,
+                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
+            ]
+        }
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u64; 2]) -> Self {
+        unsafe {
+            let x = _mm_cvtsi64_si128(xs[0] as i64);
+            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
+            Self::new(_mm_or_si128(x, y))
+        }
+    }
+}
+impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u128; 1] {
+        unimplemented!()
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u128; 1]) -> Self {
+        unimplemented!()
+    }
+}
+
+impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
+{
+    #[inline(always)]
+    fn to_lanes(self) -> [u64; 4] {
+        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
+        [a[0], a[1], b[0], b[1]]
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u64; 4]) -> Self {
+        let (a, b) = (
+            u64x2_sse2::from_lanes([xs[0], xs[1]]),
+            u64x2_sse2::from_lanes([xs[2], xs[3]]),
+        );
+        x2::new([a, b])
+    }
+}
+
+macro_rules! 
impl_into { + ($from:ident, $to:ident) => { + impl From<$from> for $to { + #[inline(always)] + fn from(x: $from) -> Self { + $to::new(x.x) + } + } + }; +} + +impl_into!(u128x1_sse2, u32x4_sse2); +impl_into!(u128x1_sse2, u64x2_sse2); + +impl_bitops32!(u32x4_sse2); +impl_bitops64!(u64x2_sse2); +impl_bitops128!(u128x1_sse2); + +impl ArithOps for u32x4_sse2 where + u32x4_sse2: BSwap +{ +} +impl ArithOps for u64x2_sse2 where + u64x2_sse2: BSwap +{ +} +impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32); +impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64); +impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add); +impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add); + +impl u32x4> for u32x4_sse2 +where + u32x4_sse2: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4, + Machine86: Machine, +{ +} +impl u64x2> for u64x2_sse2 +where + u64x2_sse2: + RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2, + Machine86: Machine, +{ +} +impl u128x1> for u128x1_sse2 +where + u128x1_sse2: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap, + Machine86: Machine, + u128x1_sse2: Into< as Machine>::u32x4>, + u128x1_sse2: Into< as Machine>::u64x2>, +{ +} + +impl u32x4> for u32x4_sse2 +where + u32x4_sse2: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4, + Machine86: Machine, +{ +} +impl u64x2> for u64x2_sse2 +where + u64x2_sse2: + RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2, + Machine86: Machine, +{ +} +impl u128x1> for u128x1_sse2 +where + u128x1_sse2: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap, + Machine86: Machine, + u128x1_sse2: Into< as Machine>::u32x4>, + u128x1_sse2: Into< as Machine>::u64x2>, +{ +} + +impl UnsafeFrom<[u32; 4]> for u32x4_sse2 { + #[inline(always)] + unsafe fn unsafe_from(xs: [u32; 4]) -> Self { + Self::new(_mm_set_epi32( + xs[3] as i32, + xs[2] as i32, + xs[1] as i32, + xs[0] as i32, + )) + } +} + +impl Vec4 for u32x4_sse2 +where + Self: MultiLane<[u32; 4]>, +{ + #[inline(always)] + fn extract(self, i: u32) -> u32 { + self.to_lanes()[i as usize] + } + #[inline(always)] + fn insert(self, v: u32, i: u32) -> Self { + Self::new(unsafe { + match i { + 0 => _mm_insert_epi32(self.x, v as i32, 0), + 1 => _mm_insert_epi32(self.x, v as i32, 1), + 2 => _mm_insert_epi32(self.x, v as i32, 2), + 3 => _mm_insert_epi32(self.x, v as i32, 3), + _ => unreachable!(), + } + }) + } +} +impl Vec4 for u32x4_sse2 +where + Self: MultiLane<[u32; 4]>, +{ + #[inline(always)] + fn extract(self, i: u32) -> u32 { + self.to_lanes()[i as usize] + } + #[inline(always)] + fn insert(self, v: u32, i: u32) -> Self { + Self::new(unsafe { + match i { + 0 => { + let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x); + _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)) + } + 1 => { + let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000); + x = _mm_slli_si128(x, 4); + x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); + _mm_shuffle_epi32(x, 0b1110_0001) + } + 2 => { + let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100); + x = _mm_slli_si128(x, 4); + x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); + _mm_shuffle_epi32(x, 0b1100_1001) + } + 3 => { + let mut x = _mm_slli_si128(self.x, 4); + x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); + _mm_shuffle_epi32(x, 0b0011_1001) + } + _ => unreachable!(), + } + }) + } +} + +impl LaneWords4 for u32x4_sse2 { + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + self.shuffle2301() + } + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + self.shuffle1230() + } + #[inline(always)] + fn 
shuffle_lane_words3012(self) -> Self { + self.shuffle3012() + } +} + +impl Words4 for u32x4_sse2 { + #[inline(always)] + fn shuffle2301(self) -> Self { + Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) + } + #[inline(always)] + fn shuffle1230(self) -> Self { + Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) }) + } + #[inline(always)] + fn shuffle3012(self) -> Self { + Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) }) + } +} + +impl Words4 for u64x4_sse2 { + #[inline(always)] + fn shuffle2301(self) -> Self { + x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)]) + } + #[inline(always)] + fn shuffle3012(self) -> Self { + unsafe { + x2::new([ + u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)), + u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)), + ]) + } + } + #[inline(always)] + fn shuffle1230(self) -> Self { + unsafe { + x2::new([ + u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)), + u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)), + ]) + } + } +} +impl Words4 for u64x4_sse2 { + #[inline(always)] + fn shuffle2301(self) -> Self { + x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)]) + } + #[inline(always)] + fn shuffle3012(self) -> Self { + unsafe { + let a = _mm_srli_si128(self.0[0].x, 8); + let b = _mm_slli_si128(self.0[0].x, 8); + let c = _mm_srli_si128(self.0[1].x, 8); + let d = _mm_slli_si128(self.0[1].x, 8); + let da = _mm_or_si128(d, a); + let bc = _mm_or_si128(b, c); + x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)]) + } + } + #[inline(always)] + fn shuffle1230(self) -> Self { + unsafe { + let a = _mm_srli_si128(self.0[0].x, 8); + let b = _mm_slli_si128(self.0[0].x, 8); + let c = _mm_srli_si128(self.0[1].x, 8); + let d = _mm_slli_si128(self.0[1].x, 8); + let da = _mm_or_si128(d, a); + let bc = _mm_or_si128(b, c); + x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)]) + } + } +} + +impl UnsafeFrom<[u64; 2]> for u64x2_sse2 { + #[inline(always)] + unsafe fn unsafe_from(xs: [u64; 2]) -> Self { + Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64)) + } +} + +impl Vec2 for u64x2_sse2 { + #[inline(always)] + fn extract(self, i: u32) -> u64 { + unsafe { + match i { + 0 => _mm_cvtsi128_si64(self.x) as u64, + 1 => _mm_extract_epi64(self.x, 1) as u64, + _ => unreachable!(), + } + } + } + #[inline(always)] + fn insert(self, x: u64, i: u32) -> Self { + Self::new(unsafe { + match i { + 0 => _mm_insert_epi64(self.x, x as i64, 0), + 1 => _mm_insert_epi64(self.x, x as i64, 1), + _ => unreachable!(), + } + }) + } +} +impl Vec2 for u64x2_sse2 { + #[inline(always)] + fn extract(self, i: u32) -> u64 { + unsafe { + match i { + 0 => _mm_cvtsi128_si64(self.x) as u64, + 1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64, + _ => unreachable!(), + } + } + } + #[inline(always)] + fn insert(self, x: u64, i: u32) -> Self { + Self::new(unsafe { + match i { + 0 => _mm_or_si128( + _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x), + _mm_cvtsi64_si128(x as i64), + ), + 1 => _mm_or_si128( + _mm_move_epi64(self.x), + _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8), + ), + _ => unreachable!(), + } + }) + } +} + +impl BSwap for u32x4_sse2 { + #[inline(always)] + fn bswap(self) -> Self { + Self::new(unsafe { + let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203); + _mm_shuffle_epi8(self.x, k) + }) + } +} +#[inline(always)] +fn bswap32_s2(x: __m128i) -> __m128i { + unsafe { + let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128()); + y = _mm_shufflehi_epi16(y, 
0b0001_1011); + y = _mm_shufflelo_epi16(y, 0b0001_1011); + let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128()); + z = _mm_shufflehi_epi16(z, 0b0001_1011); + z = _mm_shufflelo_epi16(z, 0b0001_1011); + _mm_packus_epi16(y, z) + } +} +impl BSwap for u32x4_sse2 { + #[inline(always)] + fn bswap(self) -> Self { + Self::new(bswap32_s2(self.x)) + } +} + +impl BSwap for u64x2_sse2 { + #[inline(always)] + fn bswap(self) -> Self { + Self::new(unsafe { + let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607); + _mm_shuffle_epi8(self.x, k) + }) + } +} +impl BSwap for u64x2_sse2 { + #[inline(always)] + fn bswap(self) -> Self { + Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) }) + } +} + +impl BSwap for u128x1_sse2 { + #[inline(always)] + fn bswap(self) -> Self { + Self::new(unsafe { + let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100); + _mm_shuffle_epi8(self.x, k) + }) + } +} +impl BSwap for u128x1_sse2 { + #[inline(always)] + fn bswap(self) -> Self { + Self::new(unsafe { unimplemented!() }) + } +} + +macro_rules! swapi { + ($x:expr, $i:expr, $k:expr) => { + unsafe { + const K: u8 = $k; + let k = _mm_set1_epi8(K as i8); + u128x1_sse2::new(_mm_or_si128( + _mm_srli_epi16(_mm_and_si128($x.x, k), $i), + _mm_and_si128(_mm_slli_epi16($x.x, $i), k), + )) + } + }; +} +#[inline(always)] +fn swap16_s2(x: __m128i) -> __m128i { + unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) } +} +impl Swap64 for u128x1_sse2 { + #[inline(always)] + fn swap1(self) -> Self { + swapi!(self, 1, 0xaa) + } + #[inline(always)] + fn swap2(self) -> Self { + swapi!(self, 2, 0xcc) + } + #[inline(always)] + fn swap4(self) -> Self { + swapi!(self, 4, 0xf0) + } + #[inline(always)] + fn swap8(self) -> Self { + u128x1_sse2::new(unsafe { + let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001); + _mm_shuffle_epi8(self.x, k) + }) + } + #[inline(always)] + fn swap16(self) -> Self { + u128x1_sse2::new(unsafe { + let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302); + _mm_shuffle_epi8(self.x, k) + }) + } + #[inline(always)] + fn swap32(self) -> Self { + u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) }) + } + #[inline(always)] + fn swap64(self) -> Self { + u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) + } +} +impl Swap64 for u128x1_sse2 { + #[inline(always)] + fn swap1(self) -> Self { + swapi!(self, 1, 0xaa) + } + #[inline(always)] + fn swap2(self) -> Self { + swapi!(self, 2, 0xcc) + } + #[inline(always)] + fn swap4(self) -> Self { + swapi!(self, 4, 0xf0) + } + #[inline(always)] + fn swap8(self) -> Self { + u128x1_sse2::new(unsafe { + _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8)) + }) + } + #[inline(always)] + fn swap16(self) -> Self { + u128x1_sse2::new(swap16_s2(self.x)) + } + #[inline(always)] + fn swap32(self) -> Self { + u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) }) + } + #[inline(always)] + fn swap64(self) -> Self { + u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) + } +} + +#[derive(Copy, Clone)] +pub struct G0; +#[derive(Copy, Clone)] +pub struct G1; + +#[allow(non_camel_case_types)] +pub type u32x4x2_sse2 = x2, G0>; +#[allow(non_camel_case_types)] +pub type u64x2x2_sse2 = x2, G0>; +#[allow(non_camel_case_types)] +pub type u64x4_sse2 = x2, G1>; +#[allow(non_camel_case_types)] +pub type u128x2_sse2 = x2, G0>; + +#[allow(non_camel_case_types)] +pub type u32x4x4_sse2 = x4>; +#[allow(non_camel_case_types)] +pub type u64x2x4_sse2 = 
x4>; +#[allow(non_camel_case_types)] +pub type u128x4_sse2 = x4>; + +impl u32x4x2> for u32x4x2_sse2 +where + u32x4_sse2: RotateEachWord32 + BSwap, + Machine86: Machine, + u32x4x2_sse2: MultiLane<[ as Machine>::u32x4; 2]>, + u32x4x2_sse2: Vec2< as Machine>::u32x4>, +{ +} +impl u64x2x2> for u64x2x2_sse2 +where + u64x2_sse2: RotateEachWord64 + RotateEachWord32 + BSwap, + Machine86: Machine, + u64x2x2_sse2: MultiLane<[ as Machine>::u64x2; 2]>, + u64x2x2_sse2: Vec2< as Machine>::u64x2>, +{ +} +impl u64x4> for u64x4_sse2 +where + u64x2_sse2: RotateEachWord64 + RotateEachWord32 + BSwap, + Machine86: Machine, + u64x4_sse2: MultiLane<[u64; 4]> + Vec4 + Words4, +{ +} +impl u128x2> for u128x2_sse2 +where + u128x1_sse2: Swap64 + BSwap, + Machine86: Machine, + u128x2_sse2: MultiLane<[ as Machine>::u128x1; 2]>, + u128x2_sse2: Vec2< as Machine>::u128x1>, + u128x2_sse2: Into< as Machine>::u32x4x2>, + u128x2_sse2: Into< as Machine>::u64x2x2>, + u128x2_sse2: Into< as Machine>::u64x4>, +{ +} + +impl u32x4x2> for u32x4x2_sse2 +where + u32x4_sse2: RotateEachWord32 + BSwap, + Avx2Machine: Machine, + u32x4x2_sse2: MultiLane<[ as Machine>::u32x4; 2]>, + u32x4x2_sse2: Vec2< as Machine>::u32x4>, +{ +} +impl u64x2x2> for u64x2x2_sse2 +where + u64x2_sse2: RotateEachWord64 + RotateEachWord32 + BSwap, + Avx2Machine: Machine, + u64x2x2_sse2: MultiLane<[ as Machine>::u64x2; 2]>, + u64x2x2_sse2: Vec2< as Machine>::u64x2>, +{ +} +impl u64x4> for u64x4_sse2 +where + u64x2_sse2: RotateEachWord64 + RotateEachWord32 + BSwap, + Avx2Machine: Machine, + u64x4_sse2: MultiLane<[u64; 4]> + Vec4 + Words4, +{ +} +impl u128x2> for u128x2_sse2 +where + u128x1_sse2: Swap64 + BSwap, + Avx2Machine: Machine, + u128x2_sse2: MultiLane<[ as Machine>::u128x1; 2]>, + u128x2_sse2: Vec2< as Machine>::u128x1>, + u128x2_sse2: Into< as Machine>::u32x4x2>, + u128x2_sse2: Into< as Machine>::u64x2x2>, + u128x2_sse2: Into< as Machine>::u64x4>, +{ +} + +impl Vec4 for u64x4_sse2 +where + u64x2_sse2: Copy + Vec2, +{ + #[inline(always)] + fn extract(self, i: u32) -> u64 { + match i { + 0 => self.0[0].extract(0), + 1 => self.0[0].extract(1), + 2 => self.0[1].extract(0), + 3 => self.0[1].extract(1), + _ => panic!(), + } + } + #[inline(always)] + fn insert(mut self, w: u64, i: u32) -> Self { + match i { + 0 => self.0[0] = self.0[0].insert(w, 0), + 1 => self.0[0] = self.0[0].insert(w, 1), + 2 => self.0[1] = self.0[1].insert(w, 0), + 3 => self.0[1] = self.0[1].insert(w, 1), + _ => panic!(), + }; + self + } +} + +impl u32x4x4> for u32x4x4_sse2 +where + u32x4_sse2: RotateEachWord32 + BSwap, + Machine86: Machine, + u32x4x4_sse2: MultiLane<[ as Machine>::u32x4; 4]>, + u32x4x4_sse2: Vec4< as Machine>::u32x4>, +{ +} +impl u64x2x4> for u64x2x4_sse2 +where + u64x2_sse2: RotateEachWord64 + RotateEachWord32 + BSwap, + Machine86: Machine, + u64x2x4_sse2: MultiLane<[ as Machine>::u64x2; 4]>, + u64x2x4_sse2: Vec4< as Machine>::u64x2>, +{ +} +impl u128x4> for u128x4_sse2 +where + u128x1_sse2: Swap64 + BSwap, + Machine86: Machine, + u128x4_sse2: MultiLane<[ as Machine>::u128x1; 4]>, + u128x4_sse2: Vec4< as Machine>::u128x1>, + u128x4_sse2: Into< as Machine>::u32x4x4>, + u128x4_sse2: Into< as Machine>::u64x2x4>, +{ +} + +impl u32x4x4> for u32x4x4_sse2 +where + u32x4_sse2: RotateEachWord32 + BSwap, + Avx2Machine: Machine, + u32x4x4_sse2: MultiLane<[ as Machine>::u32x4; 4]>, + u32x4x4_sse2: Vec4< as Machine>::u32x4>, +{ +} +impl u64x2x4> for u64x2x4_sse2 +where + u64x2_sse2: RotateEachWord64 + RotateEachWord32 + BSwap, + Avx2Machine: Machine, + u64x2x4_sse2: MultiLane<[ as 
Machine>::u64x2; 4]>, + u64x2x4_sse2: Vec4< as Machine>::u64x2>, +{ +} +impl u128x4> for u128x4_sse2 +where + u128x1_sse2: Swap64 + BSwap, + Avx2Machine: Machine, + u128x4_sse2: MultiLane<[ as Machine>::u128x1; 4]>, + u128x4_sse2: Vec4< as Machine>::u128x1>, + u128x4_sse2: Into< as Machine>::u32x4x4>, + u128x4_sse2: Into< as Machine>::u64x2x4>, +{ +} + +macro_rules! impl_into_x { + ($from:ident, $to:ident) => { + impl From, Gf>> + for x2<$to, Gt> + { + #[inline(always)] + fn from(x: x2<$from, Gf>) -> Self { + x2::new([$to::from(x.0[0]), $to::from(x.0[1])]) + } + } + impl From>> for x4<$to> { + #[inline(always)] + fn from(x: x4<$from>) -> Self { + x4::new([ + $to::from(x.0[0]), + $to::from(x.0[1]), + $to::from(x.0[2]), + $to::from(x.0[3]), + ]) + } + } + }; +} +impl_into_x!(u128x1_sse2, u64x2_sse2); +impl_into_x!(u128x1_sse2, u32x4_sse2); + +///// Debugging + +use core::fmt::{Debug, Formatter, Result}; + +impl PartialEq for x2 { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1] + } +} + +#[inline(always)] +unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool { + let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110); + _mm_cvtsi128_si64(q) == -1 +} + +#[inline(always)] +unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool { + let q = _mm_cmpeq_epi32(x, y); + let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8)); + let q = _mm_cvtsi128_si64(q); + (p & q) == -1 +} + +impl PartialEq for u32x4_sse2 { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + unsafe { eq128_s2(self.x, rhs.x) } + } +} +impl Debug for u32x4_sse2 +where + Self: Copy + MultiLane<[u32; 4]>, +{ + #[cold] + fn fmt(&self, fmt: &mut Formatter) -> Result { + fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes())) + } +} + +impl PartialEq for u64x2_sse2 { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + unsafe { eq128_s2(self.x, rhs.x) } + } +} +impl Debug for u64x2_sse2 +where + Self: Copy + MultiLane<[u64; 2]>, +{ + #[cold] + fn fmt(&self, fmt: &mut Formatter) -> Result { + fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes())) + } +} + +impl Debug for u64x4_sse2 +where + u64x2_sse2: Copy + MultiLane<[u64; 2]>, +{ + #[cold] + fn fmt(&self, fmt: &mut Formatter) -> Result { + let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes()); + fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]])) + } +} + +#[cfg(test)] +#[cfg(target_arch = "x86_64")] +mod test { + use super::*; + use crate::x86_64::{SSE2, SSE41, SSSE3}; + use crate::Machine; + + #[test] + #[cfg_attr(not(target_feature = "ssse3"), ignore)] + fn test_bswap32_s2_vs_s3() { + let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100]; + let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203]; + + let s2 = unsafe { SSE2::instance() }; + let s3 = unsafe { SSSE3::instance() }; + + let x_s2 = { + let x_s2: ::u32x4 = s2.vec(xs); + x_s2.bswap() + }; + + let x_s3 = { + let x_s3: ::u32x4 = s3.vec(xs); + x_s3.bswap() + }; + + assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) }); + assert_eq!(x_s2, s2.vec(ys)); + } + + #[test] + #[cfg_attr(not(target_feature = "ssse3"), ignore)] + fn test_bswap64_s2_vs_s3() { + let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100]; + let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607]; + + let s2 = unsafe { SSE2::instance() }; + let s3 = unsafe { SSSE3::instance() }; + + let x_s2 = { + let x_s2: ::u64x2 = s2.vec(xs); + x_s2.bswap() + }; + + let x_s3 = { + let x_s3: ::u64x2 = s3.vec(xs); + x_s3.bswap() + }; + + assert_eq!(x_s2, s2.vec(ys)); + 
assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + } + + #[test] + #[cfg_attr(not(target_feature = "ssse3"), ignore)] + fn test_shuffle32_s2_vs_s3() { + let xs = [0x0, 0x1, 0x2, 0x3]; + let ys = [0x2, 0x3, 0x0, 0x1]; + let zs = [0x1, 0x2, 0x3, 0x0]; + + let s2 = unsafe { SSE2::instance() }; + let s3 = unsafe { SSSE3::instance() }; + + let x_s2 = { + let x_s2: ::u32x4 = s2.vec(xs); + x_s2.shuffle2301() + }; + let x_s3 = { + let x_s3: ::u32x4 = s3.vec(xs); + x_s3.shuffle2301() + }; + assert_eq!(x_s2, s2.vec(ys)); + assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + + let x_s2 = { + let x_s2: ::u32x4 = s2.vec(xs); + x_s2.shuffle3012() + }; + let x_s3 = { + let x_s3: ::u32x4 = s3.vec(xs); + x_s3.shuffle3012() + }; + assert_eq!(x_s2, s2.vec(zs)); + assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + + let x_s2 = x_s2.shuffle1230(); + let x_s3 = x_s3.shuffle1230(); + assert_eq!(x_s2, s2.vec(xs)); + assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + } + + #[test] + #[cfg_attr(not(target_feature = "ssse3"), ignore)] + fn test_shuffle64_s2_vs_s3() { + let xs = [0x0, 0x1, 0x2, 0x3]; + let ys = [0x2, 0x3, 0x0, 0x1]; + let zs = [0x1, 0x2, 0x3, 0x0]; + + let s2 = unsafe { SSE2::instance() }; + let s3 = unsafe { SSSE3::instance() }; + + let x_s2 = { + let x_s2: ::u64x4 = s2.vec(xs); + x_s2.shuffle2301() + }; + let x_s3 = { + let x_s3: ::u64x4 = s3.vec(xs); + x_s3.shuffle2301() + }; + assert_eq!(x_s2, s2.vec(ys)); + assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + + let x_s2 = { + let x_s2: ::u64x4 = s2.vec(xs); + x_s2.shuffle3012() + }; + let x_s3 = { + let x_s3: ::u64x4 = s3.vec(xs); + x_s3.shuffle3012() + }; + assert_eq!(x_s2, s2.vec(zs)); + assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + + let x_s2 = x_s2.shuffle1230(); + let x_s3 = x_s3.shuffle1230(); + assert_eq!(x_s2, s2.vec(xs)); + assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + } + + #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)] + #[test] + fn test_lanes_u32x4() { + let xs = [0x1, 0x2, 0x3, 0x4]; + + let s2 = unsafe { SSE2::instance() }; + let s3 = unsafe { SSSE3::instance() }; + let s4 = unsafe { SSE41::instance() }; + + { + let x_s2: ::u32x4 = s2.vec(xs); + let y_s2 = ::u32x4::from_lanes(xs); + assert_eq!(x_s2, y_s2); + assert_eq!(xs, y_s2.to_lanes()); + } + + { + let x_s3: ::u32x4 = s3.vec(xs); + let y_s3 = ::u32x4::from_lanes(xs); + assert_eq!(x_s3, y_s3); + assert_eq!(xs, y_s3.to_lanes()); + } + + { + let x_s4: ::u32x4 = s4.vec(xs); + let y_s4 = ::u32x4::from_lanes(xs); + assert_eq!(x_s4, y_s4); + assert_eq!(xs, y_s4.to_lanes()); + } + } + + #[test] + #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)] + fn test_lanes_u64x2() { + let xs = [0x1, 0x2]; + + let s2 = unsafe { SSE2::instance() }; + let s3 = unsafe { SSSE3::instance() }; + let s4 = unsafe { SSE41::instance() }; + + { + let x_s2: ::u64x2 = s2.vec(xs); + let y_s2 = ::u64x2::from_lanes(xs); + assert_eq!(x_s2, y_s2); + assert_eq!(xs, y_s2.to_lanes()); + } + + { + let x_s3: ::u64x2 = s3.vec(xs); + let y_s3 = ::u64x2::from_lanes(xs); + assert_eq!(x_s3, y_s3); + assert_eq!(xs, y_s3.to_lanes()); + } + + { + let x_s4: ::u64x2 = s4.vec(xs); + let y_s4 = ::u64x2::from_lanes(xs); + assert_eq!(x_s4, y_s4); + assert_eq!(xs, y_s4.to_lanes()); + } + } + + #[test] + fn test_vec4_u32x4_s2() { + let xs = [1, 2, 3, 4]; + let s2 = unsafe { SSE2::instance() }; + let x_s2: ::u32x4 = s2.vec(xs); + assert_eq!(x_s2.extract(0), 1); + assert_eq!(x_s2.extract(1), 2); + 
assert_eq!(x_s2.extract(2), 3); + assert_eq!(x_s2.extract(3), 4); + assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4])); + assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4])); + assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4])); + assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf])); + } + + #[test] + #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)] + fn test_vec4_u32x4_s4() { + let xs = [1, 2, 3, 4]; + let s4 = unsafe { SSE41::instance() }; + let x_s4: ::u32x4 = s4.vec(xs); + assert_eq!(x_s4.extract(0), 1); + assert_eq!(x_s4.extract(1), 2); + assert_eq!(x_s4.extract(2), 3); + assert_eq!(x_s4.extract(3), 4); + assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4])); + assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4])); + assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4])); + assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf])); + } + + #[test] + fn test_vec2_u64x2_s2() { + let xs = [0x1, 0x2]; + let s2 = unsafe { SSE2::instance() }; + let x_s2: ::u64x2 = s2.vec(xs); + assert_eq!(x_s2.extract(0), 1); + assert_eq!(x_s2.extract(1), 2); + assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2])); + assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf])); + } + + #[test] + #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)] + fn test_vec4_u64x2_s4() { + let xs = [0x1, 0x2]; + let s4 = unsafe { SSE41::instance() }; + let x_s4: ::u64x2 = s4.vec(xs); + assert_eq!(x_s4.extract(0), 1); + assert_eq!(x_s4.extract(1), 2); + assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2])); + assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf])); + } +} + +pub mod avx2 { + #![allow(non_camel_case_types)] + use crate::soft::x4; + use crate::types::*; + use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2}; + use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4}; + use core::arch::x86_64::*; + use core::marker::PhantomData; + use core::ops::*; + + #[derive(Copy, Clone)] + pub struct u32x4x4_avx2 { + x: [__m256i; 2], + ni: PhantomData, + } + + impl u32x4x4_avx2 { + #[inline(always)] + fn new(x: [__m256i; 2]) -> Self { + Self { x, ni: PhantomData } + } + } + + impl u32x4x4> for u32x4x4_avx2 where NI: Copy {} + impl Store for u32x4x4_avx2 { + #[inline(always)] + unsafe fn unpack(p: vec512_storage) -> Self { + Self::new([p.avx[0].avx, p.avx[1].avx]) + } + } + impl MultiLane<[u32x4_sse2; 4]> for u32x4x4_avx2 { + #[inline(always)] + fn to_lanes(self) -> [u32x4_sse2; 4] { + unsafe { + [ + u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)), + u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)), + u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)), + u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)), + ] + } + } + #[inline(always)] + fn from_lanes(x: [u32x4_sse2; 4]) -> Self { + Self::new(unsafe { + [ + _mm256_setr_m128i(x[0].x, x[1].x), + _mm256_setr_m128i(x[2].x, x[3].x), + ] + }) + } + } + impl Vec4> for u32x4x4_avx2 { + #[inline(always)] + fn extract(self, i: u32) -> u32x4_sse2 { + unsafe { + match i { + 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)), + 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)), + 2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)), + 3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)), + _ => panic!(), + } + } + } + #[inline(always)] + fn insert(self, w: u32x4_sse2, i: u32) -> Self { + Self::new(unsafe { + match i { + 0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]], + 1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]], 
+ 2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)], + 3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)], + _ => panic!(), + } + }) + } + } + impl LaneWords4 for u32x4x4_avx2 { + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + Self::new(unsafe { + [ + _mm256_shuffle_epi32(self.x[0], 0b1001_0011), + _mm256_shuffle_epi32(self.x[1], 0b1001_0011), + ] + }) + } + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + Self::new(unsafe { + [ + _mm256_shuffle_epi32(self.x[0], 0b0100_1110), + _mm256_shuffle_epi32(self.x[1], 0b0100_1110), + ] + }) + } + #[inline(always)] + fn shuffle_lane_words3012(self) -> Self { + Self::new(unsafe { + [ + _mm256_shuffle_epi32(self.x[0], 0b0011_1001), + _mm256_shuffle_epi32(self.x[1], 0b0011_1001), + ] + }) + } + } + impl BitOps32 for u32x4x4_avx2 where NI: Copy {} + impl ArithOps for u32x4x4_avx2 where NI: Copy {} + macro_rules! shuf_lane_bytes { + ($name:ident, $k0:expr, $k1:expr) => { + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + [ + _mm256_shuffle_epi8( + self.x[0], + _mm256_set_epi64x($k0, $k1, $k0, $k1), + ), + _mm256_shuffle_epi8( + self.x[1], + _mm256_set_epi64x($k0, $k1, $k0, $k1), + ) + ] + }) + } + }; + } + macro_rules! rotr_32 { + ($name:ident, $i:expr) => { + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + [ + _mm256_or_si256( + _mm256_srli_epi32(self.x[0], $i as i32), + _mm256_slli_epi32(self.x[0], 32 - $i as i32), + ), + _mm256_or_si256( + _mm256_srli_epi32(self.x[1], $i as i32), + _mm256_slli_epi32(self.x[1], 32 - $i as i32), + ) + ] + }) + } + }; + } + impl RotateEachWord32 for u32x4x4_avx2 { + rotr_32!(rotate_each_word_right7, 7); + shuf_lane_bytes!( + rotate_each_word_right8, + 0x0c0f0e0d_080b0a09, + 0x04070605_00030201 + ); + rotr_32!(rotate_each_word_right11, 11); + rotr_32!(rotate_each_word_right12, 12); + shuf_lane_bytes!( + rotate_each_word_right16, + 0x0d0c0f0e_09080b0a, + 0x05040706_01000302 + ); + rotr_32!(rotate_each_word_right20, 20); + shuf_lane_bytes!( + rotate_each_word_right24, + 0x0e0d0c0f_0a09080b, + 0x06050407_02010003 + ); + rotr_32!(rotate_each_word_right25, 25); + } + impl BitOps0 for u32x4x4_avx2 where NI: Copy {} + impl From> for vec512_storage { + #[inline(always)] + fn from(x: u32x4x4_avx2) -> Self { + Self { + avx: [ + vec256_storage { avx: x.x[0] }, + vec256_storage { avx: x.x[1] }, + ], + } + } + } + + macro_rules! impl_assign { + ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => { + impl $Assign for $vec + where + NI: Copy, + { + #[inline(always)] + fn $assign_fn(&mut self, rhs: Self) { + *self = self.$bin_fn(rhs); + } + } + }; + } + impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor); + impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor); + impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand); + impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add); + + macro_rules! 
impl_bitop_x2 {
+        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
+            impl<NI> $Op for $vec<NI> {
+                type Output = Self;
+                #[inline(always)]
+                fn $op_fn(self, rhs: Self) -> Self::Output {
+                    Self::new(unsafe {
+                        [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
+                    })
+                }
+            }
+        };
+    }
+    impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
+    impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
+    impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
+    impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
+    impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);
+
+    impl<NI> Not for u32x4x4_avx2<NI> {
+        type Output = Self;
+        #[inline(always)]
+        fn not(self) -> Self::Output {
+            unsafe {
+                // xor with all-ones gives the bitwise complement
+                let f = _mm256_set1_epi8(-1);
+                Self::new([f, f]) ^ self
+            }
+        }
+    }
+
+    impl<NI> BSwap for u32x4x4_avx2<NI> {
+        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
+    }
+
+    impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
+    where
+        NI: Copy,
+    {
+        #[inline(always)]
+        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
+            Self::new(unsafe {
+                [
+                    _mm256_setr_m128i(x.0[0].x, x.0[1].x),
+                    _mm256_setr_m128i(x.0[2].x, x.0[3].x),
+                ]
+            })
+        }
+    }
+}
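
A closing sanity check on the shift-and-OR rotations used throughout sse2.rs
(rotr_32!, rotr_64!, and the AVX2 rotr_32! above): the same identity in standalone
scalar Rust, checked against the standard library:

    // (x >> n) | (x << (32 - n)) right-rotates a 32-bit word by n, for 0 < n < 32.
    fn rotr32(x: u32, n: u32) -> u32 {
        (x >> n) | (x << (32 - n))
    }

    fn main() {
        for &x in &[0u32, 1, 0xdead_beef, u32::MAX] {
            for n in 1..32 {
                assert_eq!(rotr32(x, n), x.rotate_right(n));
            }
        }
    }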