// crate minimums: sse2, x86_64 use core::arch::x86_64::{__m128i, __m256i}; use crate::types::*; mod sse2; #[derive(Copy, Clone)] pub struct YesS3; #[derive(Copy, Clone)] pub struct NoS3; #[derive(Copy, Clone)] pub struct YesS4; #[derive(Copy, Clone)] pub struct NoS4; #[derive(Copy, Clone)] pub struct YesA1; #[derive(Copy, Clone)] pub struct NoA1; #[derive(Copy, Clone)] pub struct YesA2; #[derive(Copy, Clone)] pub struct NoA2; #[derive(Copy, Clone)] pub struct YesNI; #[derive(Copy, Clone)] pub struct NoNI; use core::marker::PhantomData; #[derive(Copy, Clone)] pub struct SseMachine(PhantomData<(S3, S4, NI)>); impl Machine for SseMachine where sse2::u128x1_sse2: Swap64, sse2::u64x2_sse2: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2, sse2::u32x4_sse2: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4, sse2::u64x4_sse2: BSwap + Words4, sse2::u128x1_sse2: BSwap, sse2::u128x2_sse2: Into>, sse2::u128x2_sse2: Into>, sse2::u128x2_sse2: Into>, sse2::u128x4_sse2: Into>, sse2::u128x4_sse2: Into>, { type u32x4 = sse2::u32x4_sse2; type u64x2 = sse2::u64x2_sse2; type u128x1 = sse2::u128x1_sse2; type u32x4x2 = sse2::u32x4x2_sse2; type u64x2x2 = sse2::u64x2x2_sse2; type u64x4 = sse2::u64x4_sse2; type u128x2 = sse2::u128x2_sse2; type u32x4x4 = sse2::u32x4x4_sse2; type u64x2x4 = sse2::u64x2x4_sse2; type u128x4 = sse2::u128x4_sse2; #[inline(always)] unsafe fn instance() -> Self { SseMachine(PhantomData) } } #[derive(Copy, Clone)] pub struct Avx2Machine(PhantomData); impl Machine for Avx2Machine where sse2::u128x1_sse2: BSwap + Swap64, sse2::u64x2_sse2: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2, sse2::u32x4_sse2: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4, sse2::u64x4_sse2: BSwap + Words4, { type u32x4 = sse2::u32x4_sse2; type u64x2 = sse2::u64x2_sse2; type u128x1 = sse2::u128x1_sse2; type u32x4x2 = sse2::u32x4x2_sse2; type u64x2x2 = sse2::u64x2x2_sse2; type u64x4 = sse2::u64x4_sse2; type u128x2 = sse2::u128x2_sse2; type u32x4x4 = sse2::avx2::u32x4x4_avx2; type u64x2x4 = sse2::u64x2x4_sse2; type u128x4 = sse2::u128x4_sse2; #[inline(always)] unsafe fn instance() -> Self { Avx2Machine(PhantomData) } } pub type SSE2 = SseMachine; pub type SSSE3 = SseMachine; pub type SSE41 = SseMachine; /// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything /// to avoid expensive SSE/VEX conflicts. pub type AVX = SseMachine; pub type AVX2 = Avx2Machine; /// Generic wrapper for unparameterized storage of any of the possible impls. /// Converting into and out of this type should be essentially free, although it may be more /// aligned than a particular impl requires. #[allow(non_camel_case_types)] #[derive(Copy, Clone)] pub union vec128_storage { u32x4: [u32; 4], u64x2: [u64; 2], u128x1: [u128; 1], sse2: __m128i, } impl Store for vec128_storage { #[inline(always)] unsafe fn unpack(p: vec128_storage) -> Self { p } } impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage { #[inline(always)] fn into(self) -> &'a [u32; 4] { unsafe { &self.u32x4 } } } impl Into for [u32; 4] { #[inline(always)] fn into(self) -> vec128_storage { vec128_storage { u32x4: self } } } impl Default for vec128_storage { #[inline(always)] fn default() -> Self { vec128_storage { u128x1: [0] } } } impl Eq for vec128_storage {} impl PartialEq for vec128_storage { #[inline(always)] fn eq(&self, rhs: &Self) -> bool { unsafe { self.u128x1 == rhs.u128x1 } } } #[allow(non_camel_case_types)] #[derive(Copy, Clone)] pub union vec256_storage { u32x8: [u32; 8], u64x4: [u64; 4], u128x2: [u128; 2], sse2: [vec128_storage; 2], avx: __m256i, } impl Into for [u64; 4] { #[inline(always)] fn into(self) -> vec256_storage { vec256_storage { u64x4: self } } } impl Default for vec256_storage { #[inline(always)] fn default() -> Self { vec256_storage { u128x2: [0, 0] } } } impl vec256_storage { pub fn new128(xs: [vec128_storage; 2]) -> Self { Self { sse2: xs } } pub fn split128(self) -> [vec128_storage; 2] { unsafe { self.sse2 } } } impl Eq for vec256_storage {} impl PartialEq for vec256_storage { #[inline(always)] fn eq(&self, rhs: &Self) -> bool { unsafe { self.sse2 == rhs.sse2 } } } #[allow(non_camel_case_types)] #[derive(Copy, Clone)] pub union vec512_storage { u32x16: [u32; 16], u64x8: [u64; 8], u128x4: [u128; 4], sse2: [vec128_storage; 4], avx: [vec256_storage; 2], } impl Default for vec512_storage { #[inline(always)] fn default() -> Self { vec512_storage { u128x4: [0, 0, 0, 0], } } } impl vec512_storage { pub fn new128(xs: [vec128_storage; 4]) -> Self { Self { sse2: xs } } pub fn split128(self) -> [vec128_storage; 4] { unsafe { self.sse2 } } } impl Eq for vec512_storage {} impl PartialEq for vec512_storage { #[inline(always)] fn eq(&self, rhs: &Self) -> bool { unsafe { self.avx == rhs.avx } } } macro_rules! impl_into { ($storage:ident, $array:ty, $name:ident) => { impl Into<$array> for $storage { #[inline(always)] fn into(self) -> $array { unsafe { self.$name } } } }; } impl_into!(vec128_storage, [u32; 4], u32x4); impl_into!(vec128_storage, [u64; 2], u64x2); impl_into!(vec128_storage, [u128; 1], u128x1); impl_into!(vec256_storage, [u32; 8], u32x8); impl_into!(vec256_storage, [u64; 4], u64x4); impl_into!(vec256_storage, [u128; 2], u128x2); impl_into!(vec512_storage, [u32; 16], u32x16); impl_into!(vec512_storage, [u64; 8], u64x8); impl_into!(vec512_storage, [u128; 4], u128x4); /// Generate the full set of optimized implementations to take advantage of the most important /// hardware feature sets. /// /// This dispatcher is suitable for maximizing throughput. #[macro_export] macro_rules! dispatch { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { #[cfg(feature = "std")] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { #[inline(always)] fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body use std::arch::x86_64::*; #[target_feature(enable = "avx2")] unsafe fn impl_avx2($($arg: $argty),*) -> $ret { let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*); _mm256_zeroupper(); ret } #[target_feature(enable = "avx")] #[target_feature(enable = "sse4.1")] #[target_feature(enable = "ssse3")] unsafe fn impl_avx($($arg: $argty),*) -> $ret { let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*); _mm256_zeroupper(); ret } #[target_feature(enable = "sse4.1")] #[target_feature(enable = "ssse3")] unsafe fn impl_sse41($($arg: $argty),*) -> $ret { fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) } #[target_feature(enable = "ssse3")] unsafe fn impl_ssse3($($arg: $argty),*) -> $ret { fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) } #[target_feature(enable = "sse2")] unsafe fn impl_sse2($($arg: $argty),*) -> $ret { fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) } unsafe { if is_x86_feature_detected!("avx2") { impl_avx2($($arg),*) } else if is_x86_feature_detected!("avx") { impl_avx($($arg),*) } else if is_x86_feature_detected!("sse4.1") { impl_sse41($($arg),*) } else if is_x86_feature_detected!("ssse3") { impl_ssse3($($arg),*) } else if is_x86_feature_detected!("sse2") { impl_sse2($($arg),*) } else { unimplemented!() } } } #[cfg(not(feature = "std"))] #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body unsafe { if cfg!(target_feature = "avx2") { fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) } else if cfg!(target_feature = "avx") { fn_impl($crate::x86_64::AVX::instance(), $($arg),*) } else if cfg!(target_feature = "sse4.1") { fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) } else if cfg!(target_feature = "ssse3") { fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) } else { fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) } } } }; ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { dispatch!($mach, $MTy, { $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body }); } } /// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit /// vectors on this platfrom. For x86-64, that would mean SSE2 and AVX. /// /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware /// features (e.g. because they are done infrequently), so minimizing their contribution to code /// size is more important. #[macro_export] macro_rules! dispatch_light128 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { #[cfg(feature = "std")] $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret { #[inline(always)] fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body use std::arch::x86_64::*; #[target_feature(enable = "avx")] unsafe fn impl_avx($($arg: $argty),*) -> $ret { fn_impl($crate::x86_64::AVX::instance(), $($arg),*) } #[target_feature(enable = "sse2")] unsafe fn impl_sse2($($arg: $argty),*) -> $ret { fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) } unsafe { if is_x86_feature_detected!("avx") { impl_avx($($arg),*) } else if is_x86_feature_detected!("sse2") { impl_sse2($($arg),*) } else { unimplemented!() } } } #[cfg(not(feature = "std"))] #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body unsafe { if cfg!(target_feature = "avx2") { fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) } else if cfg!(target_feature = "avx") { fn_impl($crate::x86_64::AVX::instance(), $($arg),*) } else if cfg!(target_feature = "sse4.1") { fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) } else if cfg!(target_feature = "ssse3") { fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) } else { fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) } } } }; ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { dispatch_light128!($mach, $MTy, { $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body }); } } /// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit /// vectors on this platfrom. For x86-64, that would mean SSE2, AVX, and AVX2. /// /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware /// features (e.g. because they are done infrequently), so minimizing their contribution to code /// size is more important. #[macro_export] macro_rules! dispatch_light256 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { #[cfg(feature = "std")] $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> $ret { #[inline(always)] fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body use std::arch::x86_64::*; #[target_feature(enable = "avx")] unsafe fn impl_avx($($arg: $argty),*) -> $ret { fn_impl($crate::x86_64::AVX::instance(), $($arg),*) } #[target_feature(enable = "sse2")] unsafe fn impl_sse2($($arg: $argty),*) -> $ret { fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) } unsafe { if is_x86_feature_detected!("avx") { impl_avx($($arg),*) } else if is_x86_feature_detected!("sse2") { impl_sse2($($arg),*) } else { unimplemented!() } } } #[cfg(not(feature = "std"))] #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body unsafe { if cfg!(target_feature = "avx2") { fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) } else if cfg!(target_feature = "avx") { fn_impl($crate::x86_64::AVX::instance(), $($arg),*) } else if cfg!(target_feature = "sse4.1") { fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) } else if cfg!(target_feature = "ssse3") { fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) } else { fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) } } } }; ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { dispatch_light256!($mach, $MTy, { $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body }); } }