Diffstat (limited to 'vendor/ppv-lite86/src/x86_64/mod.rs')
-rw-r--r--  vendor/ppv-lite86/src/x86_64/mod.rs  433
1 file changed, 433 insertions, 0 deletions
diff --git a/vendor/ppv-lite86/src/x86_64/mod.rs b/vendor/ppv-lite86/src/x86_64/mod.rs
new file mode 100644
index 000000000..ecf184f36
--- /dev/null
+++ b/vendor/ppv-lite86/src/x86_64/mod.rs
@@ -0,0 +1,433 @@
+// crate minimums: sse2, x86_64
+
+use core::arch::x86_64::{__m128i, __m256i};
+use crate::types::*;
+
+mod sse2;
+
+#[derive(Copy, Clone)]
+pub struct YesS3;
+#[derive(Copy, Clone)]
+pub struct NoS3;
+
+#[derive(Copy, Clone)]
+pub struct YesS4;
+#[derive(Copy, Clone)]
+pub struct NoS4;
+
+#[derive(Copy, Clone)]
+pub struct YesA1;
+#[derive(Copy, Clone)]
+pub struct NoA1;
+
+#[derive(Copy, Clone)]
+pub struct YesA2;
+#[derive(Copy, Clone)]
+pub struct NoA2;
+
+#[derive(Copy, Clone)]
+pub struct YesNI;
+#[derive(Copy, Clone)]
+pub struct NoNI;
+
+use core::marker::PhantomData;
+
+#[derive(Copy, Clone)]
+pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
+impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
+where
+ sse2::u128x1_sse2<S3, S4, NI>: Swap64,
+ sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
+ sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
+ sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
+ sse2::u128x1_sse2<S3, S4, NI>: BSwap,
+ sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
+ sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
+ sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
+ sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
+ sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
+{
+ type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
+ type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
+ type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;
+
+ type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
+ type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
+ type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
+ type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;
+
+ type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
+ type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
+ type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;
+
+ #[inline(always)]
+ unsafe fn instance() -> Self {
+ SseMachine(PhantomData)
+ }
+}
+
+#[derive(Copy, Clone)]
+pub struct Avx2Machine<NI>(PhantomData<NI>);
+impl<NI: Copy> Machine for Avx2Machine<NI>
+where
+ sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
+ sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
+ sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
+ sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
+{
+ type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
+ type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
+ type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
+
+ type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
+ type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
+ type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
+ type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
+
+ type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
+ type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
+ type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;
+
+ #[inline(always)]
+ unsafe fn instance() -> Self {
+ Avx2Machine(PhantomData)
+ }
+}
+
+pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
+pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
+pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
+/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
+/// to avoid expensive SSE/VEX transition penalties.
+pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
+pub type AVX2 = Avx2Machine<NoNI>;
+
+/// Generic wrapper for unparameterized storage of any of the possible impls.
+/// Converting into and out of this type should be essentially free, although it may be more
+/// aligned than a particular impl requires.
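+///
+/// A small round-trip sketch using the conversions defined in this module:
+///
+/// ```ignore
+/// let words: [u32; 4] = [1, 2, 3, 4];
+/// // `[u32; 4]` -> `vec128_storage` via the `Into<vec128_storage>` impl below...
+/// let stored: vec128_storage = words.into();
+/// // ...and back out via the `impl_into!`-generated `Into<[u32; 4]>`.
+/// let back: [u32; 4] = stored.into();
+/// assert_eq!(words, back);
+/// ```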
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone)]
+pub union vec128_storage {
+ u32x4: [u32; 4],
+ u64x2: [u64; 2],
+ u128x1: [u128; 1],
+ sse2: __m128i,
+}
+impl Store<vec128_storage> for vec128_storage {
+ #[inline(always)]
+ unsafe fn unpack(p: vec128_storage) -> Self {
+ p
+ }
+}
+impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
+ #[inline(always)]
+ fn into(self) -> &'a [u32; 4] {
+ unsafe { &self.u32x4 }
+ }
+}
+impl Into<vec128_storage> for [u32; 4] {
+ #[inline(always)]
+ fn into(self) -> vec128_storage {
+ vec128_storage { u32x4: self }
+ }
+}
+impl Default for vec128_storage {
+ #[inline(always)]
+ fn default() -> Self {
+ vec128_storage { u128x1: [0] }
+ }
+}
+impl Eq for vec128_storage {}
+impl PartialEq for vec128_storage {
+ #[inline(always)]
+ fn eq(&self, rhs: &Self) -> bool {
+ unsafe { self.u128x1 == rhs.u128x1 }
+ }
+}
+
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone)]
+pub union vec256_storage {
+ u32x8: [u32; 8],
+ u64x4: [u64; 4],
+ u128x2: [u128; 2],
+ sse2: [vec128_storage; 2],
+ avx: __m256i,
+}
+impl Into<vec256_storage> for [u64; 4] {
+ #[inline(always)]
+ fn into(self) -> vec256_storage {
+ vec256_storage { u64x4: self }
+ }
+}
+impl Default for vec256_storage {
+ #[inline(always)]
+ fn default() -> Self {
+ vec256_storage { u128x2: [0, 0] }
+ }
+}
+impl vec256_storage {
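+    /// Build a 256-bit storage value from two 128-bit halves.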
+ pub fn new128(xs: [vec128_storage; 2]) -> Self {
+ Self { sse2: xs }
+ }
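+    /// Split this 256-bit storage value into its two 128-bit halves.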
+ pub fn split128(self) -> [vec128_storage; 2] {
+ unsafe { self.sse2 }
+ }
+}
+impl Eq for vec256_storage {}
+impl PartialEq for vec256_storage {
+ #[inline(always)]
+ fn eq(&self, rhs: &Self) -> bool {
+ unsafe { self.sse2 == rhs.sse2 }
+ }
+}
+
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone)]
+pub union vec512_storage {
+ u32x16: [u32; 16],
+ u64x8: [u64; 8],
+ u128x4: [u128; 4],
+ sse2: [vec128_storage; 4],
+ avx: [vec256_storage; 2],
+}
+impl Default for vec512_storage {
+ #[inline(always)]
+ fn default() -> Self {
+ vec512_storage {
+ u128x4: [0, 0, 0, 0],
+ }
+ }
+}
+impl vec512_storage {
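+    /// Build a 512-bit storage value from four 128-bit pieces.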
+ pub fn new128(xs: [vec128_storage; 4]) -> Self {
+ Self { sse2: xs }
+ }
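+    /// Split this 512-bit storage value into its four 128-bit pieces.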
+ pub fn split128(self) -> [vec128_storage; 4] {
+ unsafe { self.sse2 }
+ }
+}
+impl Eq for vec512_storage {}
+impl PartialEq for vec512_storage {
+ #[inline(always)]
+ fn eq(&self, rhs: &Self) -> bool {
+ unsafe { self.avx == rhs.avx }
+ }
+}
+
+macro_rules! impl_into {
+ ($storage:ident, $array:ty, $name:ident) => {
+ impl Into<$array> for $storage {
+ #[inline(always)]
+ fn into(self) -> $array {
+ unsafe { self.$name }
+ }
+ }
+ };
+}
+impl_into!(vec128_storage, [u32; 4], u32x4);
+impl_into!(vec128_storage, [u64; 2], u64x2);
+impl_into!(vec128_storage, [u128; 1], u128x1);
+impl_into!(vec256_storage, [u32; 8], u32x8);
+impl_into!(vec256_storage, [u64; 4], u64x4);
+impl_into!(vec256_storage, [u128; 2], u128x2);
+impl_into!(vec512_storage, [u32; 16], u32x16);
+impl_into!(vec512_storage, [u64; 8], u64x8);
+impl_into!(vec512_storage, [u128; 4], u128x4);
+
+/// Generate the full set of optimized implementations to take advantage of the most important
+/// hardware feature sets.
+///
+/// This dispatcher is suitable for maximizing throughput.
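+///
+/// Invocation sketch (`sum_words` and its body are illustrative, not part of this crate;
+/// visibility qualifiers are written in square brackets, e.g. `[pub]` or `[pub(crate)]`):
+///
+/// ```ignore
+/// dispatch!(m, Mach, {
+///     [pub] fn sum_words(a: u32, b: u32) -> u32 {
+///         // Inside the body, `m: Mach` is the most capable `Machine` impl detected at
+///         // runtime (AVX2, then AVX, SSE4.1, SSSE3, and finally plain SSE2).
+///         let _ = m;
+///         a.wrapping_add(b)
+///     }
+/// });
+/// ```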
+#[macro_export]
+macro_rules! dispatch {
+ ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+ #[cfg(feature = "std")]
+ $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+ #[inline(always)]
+ fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+ use std::arch::x86_64::*;
+ #[target_feature(enable = "avx2")]
+ unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
+ let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
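+                // Zero the upper halves of the ymm registers so that later legacy-SSE
+                // code does not pay the AVX-to-SSE transition penalty.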
+ _mm256_zeroupper();
+ ret
+ }
+ #[target_feature(enable = "avx")]
+ #[target_feature(enable = "sse4.1")]
+ #[target_feature(enable = "ssse3")]
+ unsafe fn impl_avx($($arg: $argty),*) -> $ret {
+ let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
+ _mm256_zeroupper();
+ ret
+ }
+ #[target_feature(enable = "sse4.1")]
+ #[target_feature(enable = "ssse3")]
+ unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
+ fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+ }
+ #[target_feature(enable = "ssse3")]
+ unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
+ fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+ }
+ #[target_feature(enable = "sse2")]
+ unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
+ fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+ }
+ unsafe {
+ if is_x86_feature_detected!("avx2") {
+ impl_avx2($($arg),*)
+ } else if is_x86_feature_detected!("avx") {
+ impl_avx($($arg),*)
+ } else if is_x86_feature_detected!("sse4.1") {
+ impl_sse41($($arg),*)
+ } else if is_x86_feature_detected!("ssse3") {
+ impl_ssse3($($arg),*)
+ } else if is_x86_feature_detected!("sse2") {
+ impl_sse2($($arg),*)
+ } else {
+ unimplemented!()
+ }
+ }
+ }
+ #[cfg(not(feature = "std"))]
+ #[inline(always)]
+ $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+ unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
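+            // Without `std` there is no runtime CPU detection; pick the best
+            // implementation allowed by the compile-time target features.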
+ unsafe {
+ if cfg!(target_feature = "avx2") {
+ fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
+ } else if cfg!(target_feature = "avx") {
+ fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+ } else if cfg!(target_feature = "sse4.1") {
+ fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+ } else if cfg!(target_feature = "ssse3") {
+ fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+ } else {
+ fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+ }
+ }
+ }
+ };
+ ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+ dispatch!($mach, $MTy, {
+ $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+ });
+ }
+}
+
+/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
+/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
+///
+/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
+/// features (e.g. because they are done infrequently), so minimizing their contribution to code
+/// size is more important.
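+///
+/// Invoked exactly like `dispatch!`; the sketch below uses the arm for functions that return
+/// nothing (`zero_block` is illustrative, not part of this crate):
+///
+/// ```ignore
+/// dispatch_light128!(m, Mach, {
+///     [pub] fn zero_block(block: &mut [u32; 4]) {
+///         // Only the SSE2 and AVX paths are generated, trading peak speed for code size.
+///         let _ = m;
+///         *block = [0; 4];
+///     }
+/// });
+/// ```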
+#[macro_export]
+macro_rules! dispatch_light128 {
+ ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+ #[cfg(feature = "std")]
+ $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+ #[inline(always)]
+ fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+ use std::arch::x86_64::*;
+ #[target_feature(enable = "avx")]
+ unsafe fn impl_avx($($arg: $argty),*) -> $ret {
+ fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+ }
+ #[target_feature(enable = "sse2")]
+ unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
+ fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+ }
+ unsafe {
+ if is_x86_feature_detected!("avx") {
+ impl_avx($($arg),*)
+ } else if is_x86_feature_detected!("sse2") {
+ impl_sse2($($arg),*)
+ } else {
+ unimplemented!()
+ }
+ }
+ }
+ #[cfg(not(feature = "std"))]
+ #[inline(always)]
+ $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+ unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+ unsafe {
+ if cfg!(target_feature = "avx2") {
+ fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
+ } else if cfg!(target_feature = "avx") {
+ fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+ } else if cfg!(target_feature = "sse4.1") {
+ fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+ } else if cfg!(target_feature = "ssse3") {
+ fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+ } else {
+ fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+ }
+ }
+ }
+ };
+ ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+ dispatch_light128!($mach, $MTy, {
+ $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+ });
+ }
+}
+
+/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
+/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
+///
+/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
+/// features (e.g. because they are done infrequently), so minimizing their contribution to code
+/// size is more important.
+#[macro_export]
+macro_rules! dispatch_light256 {
+ ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+ #[cfg(feature = "std")]
+        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+ #[inline(always)]
+ fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+ use std::arch::x86_64::*;
+ #[target_feature(enable = "avx")]
+ unsafe fn impl_avx($($arg: $argty),*) -> $ret {
+ fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+ }
+ #[target_feature(enable = "sse2")]
+ unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
+ fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+ }
+ unsafe {
+ if is_x86_feature_detected!("avx") {
+ impl_avx($($arg),*)
+ } else if is_x86_feature_detected!("sse2") {
+ impl_sse2($($arg),*)
+ } else {
+ unimplemented!()
+ }
+ }
+ }
+ #[cfg(not(feature = "std"))]
+ #[inline(always)]
+ $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+ unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+ unsafe {
+ if cfg!(target_feature = "avx2") {
+ fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
+ } else if cfg!(target_feature = "avx") {
+ fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+ } else if cfg!(target_feature = "sse4.1") {
+ fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+ } else if cfg!(target_feature = "ssse3") {
+ fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+ } else {
+ fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+ }
+ }
+ }
+ };
+ ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+ dispatch_light256!($mach, $MTy, {
+ $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+ });
+ }
+}