author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-18 02:49:50 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-18 02:49:50 +0000
commit     9835e2ae736235810b4ea1c162ca5e65c547e770 (patch)
tree       3fcebf40ed70e581d776a8a4c65923e8ec20e026 /vendor/ppv-lite86/src
parent     Releasing progress-linux version 1.70.0+dfsg2-1~progress7.99u1. (diff)
download   rustc-9835e2ae736235810b4ea1c162ca5e65c547e770.tar.xz
           rustc-9835e2ae736235810b4ea1c162ca5e65c547e770.zip
Merging upstream version 1.71.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/ppv-lite86/src')
-rw-r--r--  vendor/ppv-lite86/src/generic.rs     | 158
-rw-r--r--  vendor/ppv-lite86/src/lib.rs         |   8
-rw-r--r--  vendor/ppv-lite86/src/soft.rs        |  92
-rw-r--r--  vendor/ppv-lite86/src/types.rs       | 321
-rw-r--r--  vendor/ppv-lite86/src/x86_64/mod.rs  |  32
-rw-r--r--  vendor/ppv-lite86/src/x86_64/sse2.rs | 431
6 files changed, 611 insertions, 431 deletions
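Before the full diff, a minimal standalone sketch of the u128 <-> [u64; 2] conversion helpers (`o_of_q` / `q_of_o`) that the generic.rs hunk below introduces once the `o: [u128; 1]` field is dropped from `vec128_storage`. The two function bodies mirror the diff; the surrounding `main` and asserts are illustrative only.

// Little-endian limb packing: q[0] is the low 64 bits, q[1] the high 64 bits.
fn o_of_q(q: [u64; 2]) -> u128 {
    u128::from(q[0]) | (u128::from(q[1]) << 64)
}

fn q_of_o(o: u128) -> [u64; 2] {
    [o as u64, (o >> 64) as u64]
}

fn main() {
    let x: u128 = 0x0123_4567_89ab_cdef_fedc_ba98_7654_3210;
    // The helpers are inverses, so storing a u128 through the q-backed union
    // and reading it back is lossless.
    assert_eq!(o_of_q(q_of_o(x)), x);
    assert_eq!(q_of_o(o_of_q([1, 2])), [1, 2]);
}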
diff --git a/vendor/ppv-lite86/src/generic.rs b/vendor/ppv-lite86/src/generic.rs index 4f4113fc3..add6c4856 100644 --- a/vendor/ppv-lite86/src/generic.rs +++ b/vendor/ppv-lite86/src/generic.rs @@ -1,50 +1,50 @@ #![allow(non_camel_case_types)] -use core::ops::*; use crate::soft::{x2, x4}; use crate::types::*; +use core::ops::*; +#[repr(C)] #[derive(Clone, Copy)] pub union vec128_storage { d: [u32; 4], q: [u64; 2], - o: [u128; 1], } impl From<[u32; 4]> for vec128_storage { - #[inline] + #[inline(always)] fn from(d: [u32; 4]) -> Self { Self { d } } } impl From<vec128_storage> for [u32; 4] { - #[inline] + #[inline(always)] fn from(d: vec128_storage) -> Self { unsafe { d.d } } } impl From<[u64; 2]> for vec128_storage { - #[inline] + #[inline(always)] fn from(q: [u64; 2]) -> Self { Self { q } } } impl From<vec128_storage> for [u64; 2] { - #[inline] + #[inline(always)] fn from(q: vec128_storage) -> Self { unsafe { q.q } } } impl Default for vec128_storage { - #[inline] + #[inline(always)] fn default() -> Self { - Self { o: [0] } + Self { q: [0, 0] } } } impl Eq for vec128_storage {} impl PartialEq<vec128_storage> for vec128_storage { - #[inline] + #[inline(always)] fn eq(&self, rhs: &Self) -> bool { - unsafe { self.o == rhs.o } + unsafe { self.q == rhs.q } } } #[derive(Clone, Copy, PartialEq, Eq, Default)] @@ -61,20 +61,22 @@ impl vec256_storage { self.v128 } } -impl From<[u64; 4]> for vec256_storage { - #[inline] - fn from(q: [u64; 4]) -> Self { - Self { v128: [[0, 1].into(), [2, 3].into()] } - } -} impl From<vec256_storage> for [u64; 4] { - #[inline] + #[inline(always)] fn from(q: vec256_storage) -> Self { let [a, b]: [u64; 2] = q.v128[0].into(); let [c, d]: [u64; 2] = q.v128[1].into(); [a, b, c, d] } } +impl From<[u64; 4]> for vec256_storage { + #[inline(always)] + fn from([a, b, c, d]: [u64; 4]) -> Self { + Self { + v128: [[a, b].into(), [c, d].into()], + } + } +} #[derive(Clone, Copy, PartialEq, Eq, Default)] pub struct vec512_storage { v128: [vec128_storage; 4], @@ -90,6 +92,7 @@ impl vec512_storage { } } +#[inline(always)] fn dmap<T, F>(t: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -123,6 +126,7 @@ where unsafe { T::unpack(d) } } +#[inline(always)] fn qmap<T, F>(t: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -136,6 +140,7 @@ where unsafe { T::unpack(q) } } +#[inline(always)] fn qmap2<T, F>(a: T, b: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -151,17 +156,29 @@ where unsafe { T::unpack(q) } } +#[inline(always)] +fn o_of_q(q: [u64; 2]) -> u128 { + u128::from(q[0]) | (u128::from(q[1]) << 64) +} + +#[inline(always)] +fn q_of_o(o: u128) -> [u64; 2] { + [o as u64, (o >> 64) as u64] +} + +#[inline(always)] fn omap<T, F>(a: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, F: Fn(u128) -> u128, { let a: vec128_storage = a.into(); - let ao = unsafe { a.o }; - let o = vec128_storage { o: [f(ao[0])] }; + let ao = o_of_q(unsafe { a.q }); + let o = vec128_storage { q: q_of_o(f(ao)) }; unsafe { T::unpack(o) } } +#[inline(always)] fn omap2<T, F>(a: T, b: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -169,10 +186,10 @@ where { let a: vec128_storage = a.into(); let b: vec128_storage = b.into(); - let ao = unsafe { a.o }; - let bo = unsafe { b.o }; + let ao = o_of_q(unsafe { a.q }); + let bo = o_of_q(unsafe { b.q }); let o = vec128_storage { - o: [f(ao[0], bo[0])], + q: q_of_o(f(ao, bo)), }; unsafe { T::unpack(o) } } @@ -245,39 +262,39 @@ macro_rules! 
impl_bitops { } impl Swap64 for $vec { - #[inline] + #[inline(always)] fn swap1(self) -> Self { qmap(self, |x| { ((x & 0x5555555555555555) << 1) | ((x & 0xaaaaaaaaaaaaaaaa) >> 1) }) } - #[inline] + #[inline(always)] fn swap2(self) -> Self { qmap(self, |x| { ((x & 0x3333333333333333) << 2) | ((x & 0xcccccccccccccccc) >> 2) }) } - #[inline] + #[inline(always)] fn swap4(self) -> Self { qmap(self, |x| { ((x & 0x0f0f0f0f0f0f0f0f) << 4) | ((x & 0xf0f0f0f0f0f0f0f0) >> 4) }) } - #[inline] + #[inline(always)] fn swap8(self) -> Self { qmap(self, |x| { ((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8) }) } - #[inline] + #[inline(always)] fn swap16(self) -> Self { dmap(self, |x| x.rotate_left(16)) } - #[inline] + #[inline(always)] fn swap32(self) -> Self { qmap(self, |x| x.rotate_left(32)) } - #[inline] + #[inline(always)] fn swap64(self) -> Self { omap(self, |x| (x << 64) | (x >> 64)) } @@ -289,82 +306,83 @@ impl_bitops!(u64x2_generic); impl_bitops!(u128x1_generic); impl RotateEachWord32 for u32x4_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right7(self) -> Self { dmap(self, |x| x.rotate_right(7)) } - #[inline] + #[inline(always)] fn rotate_each_word_right8(self) -> Self { dmap(self, |x| x.rotate_right(8)) } - #[inline] + #[inline(always)] fn rotate_each_word_right11(self) -> Self { dmap(self, |x| x.rotate_right(11)) } - #[inline] + #[inline(always)] fn rotate_each_word_right12(self) -> Self { dmap(self, |x| x.rotate_right(12)) } - #[inline] + #[inline(always)] fn rotate_each_word_right16(self) -> Self { dmap(self, |x| x.rotate_right(16)) } - #[inline] + #[inline(always)] fn rotate_each_word_right20(self) -> Self { dmap(self, |x| x.rotate_right(20)) } - #[inline] + #[inline(always)] fn rotate_each_word_right24(self) -> Self { dmap(self, |x| x.rotate_right(24)) } - #[inline] + #[inline(always)] fn rotate_each_word_right25(self) -> Self { dmap(self, |x| x.rotate_right(25)) } } impl RotateEachWord32 for u64x2_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right7(self) -> Self { qmap(self, |x| x.rotate_right(7)) } - #[inline] + #[inline(always)] fn rotate_each_word_right8(self) -> Self { qmap(self, |x| x.rotate_right(8)) } - #[inline] + #[inline(always)] fn rotate_each_word_right11(self) -> Self { qmap(self, |x| x.rotate_right(11)) } - #[inline] + #[inline(always)] fn rotate_each_word_right12(self) -> Self { qmap(self, |x| x.rotate_right(12)) } - #[inline] + #[inline(always)] fn rotate_each_word_right16(self) -> Self { qmap(self, |x| x.rotate_right(16)) } - #[inline] + #[inline(always)] fn rotate_each_word_right20(self) -> Self { qmap(self, |x| x.rotate_right(20)) } - #[inline] + #[inline(always)] fn rotate_each_word_right24(self) -> Self { qmap(self, |x| x.rotate_right(24)) } - #[inline] + #[inline(always)] fn rotate_each_word_right25(self) -> Self { qmap(self, |x| x.rotate_right(25)) } } impl RotateEachWord64 for u64x2_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right32(self) -> Self { qmap(self, |x| x.rotate_right(32)) } } // workaround for koute/cargo-web#52 (u128::rotate_* broken with cargo web) +#[inline(always)] fn rotate_u128_right(x: u128, i: u32) -> u128 { (x >> i) | (x << (128 - i)) } @@ -375,41 +393,41 @@ fn test_rotate_u128() { } impl RotateEachWord32 for u128x1_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right7(self) -> Self { Self([rotate_u128_right(self.0[0], 7)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right8(self) -> Self { Self([rotate_u128_right(self.0[0], 8)]) } - #[inline] + 
#[inline(always)] fn rotate_each_word_right11(self) -> Self { Self([rotate_u128_right(self.0[0], 11)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right12(self) -> Self { Self([rotate_u128_right(self.0[0], 12)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right16(self) -> Self { Self([rotate_u128_right(self.0[0], 16)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right20(self) -> Self { Self([rotate_u128_right(self.0[0], 20)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right24(self) -> Self { Self([rotate_u128_right(self.0[0], 24)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right25(self) -> Self { Self([rotate_u128_right(self.0[0], 25)]) } } impl RotateEachWord64 for u128x1_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right32(self) -> Self { Self([rotate_u128_right(self.0[0], 32)]) } @@ -428,7 +446,7 @@ impl Machine for GenericMachine { type u32x4x4 = u32x4x4_generic; type u64x2x4 = u64x2x4_generic; type u128x4 = u128x4_generic; - #[inline] + #[inline(always)] unsafe fn instance() -> Self { Self } @@ -456,7 +474,7 @@ impl From<u64x2_generic> for vec128_storage { impl From<u128x1_generic> for vec128_storage { #[inline(always)] fn from(o: u128x1_generic) -> Self { - Self { o: o.0 } + Self { q: q_of_o(o.0[0]) } } } @@ -475,7 +493,7 @@ impl Store<vec128_storage> for u64x2_generic { impl Store<vec128_storage> for u128x1_generic { #[inline(always)] unsafe fn unpack(s: vec128_storage) -> Self { - Self(s.o) + Self([o_of_q(s.q); 1]) } } @@ -605,6 +623,22 @@ pub type u32x4x4_generic = x4<u32x4_generic>; pub type u64x2x4_generic = x4<u64x2_generic>; pub type u128x4_generic = x4<u128x1_generic>; +impl Vector<[u32; 16]> for u32x4x4_generic { + fn to_scalars(self) -> [u32; 16] { + let [a, b, c, d] = self.0; + let a = a.0; + let b = b.0; + let c = c.0; + let d = d.0; + [ + a[0], a[1], a[2], a[3], // + b[0], b[1], b[2], b[3], // + c[0], c[1], c[2], c[3], // + d[0], d[1], d[2], d[3], // + ] + } +} + impl MultiLane<[u32; 4]> for u32x4_generic { #[inline(always)] fn to_lanes(self) -> [u32; 4] { @@ -745,7 +779,7 @@ impl u128x4<GenericMachine> for u128x4_generic {} #[macro_export] macro_rules! dispatch { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline] + #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] @@ -762,7 +796,7 @@ macro_rules! dispatch { #[macro_export] macro_rules! dispatch_light128 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline] + #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] @@ -779,7 +813,7 @@ macro_rules! dispatch_light128 { #[macro_export] macro_rules! dispatch_light256 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline] + #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] @@ -796,7 +830,7 @@ macro_rules! dispatch_light256 { #[macro_export] macro_rules! 
dispatch_light512 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline] + #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] diff --git a/vendor/ppv-lite86/src/lib.rs b/vendor/ppv-lite86/src/lib.rs index 43dc5d869..638552fc2 100644 --- a/vendor/ppv-lite86/src/lib.rs +++ b/vendor/ppv-lite86/src/lib.rs @@ -9,14 +9,14 @@ mod soft; mod types; pub use self::types::*; -#[cfg(all(feature = "simd", target_arch = "x86_64", not(miri)))] +#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(feature = "no_simd"), not(miri)))] pub mod x86_64; -#[cfg(all(feature = "simd", target_arch = "x86_64", not(miri)))] +#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(feature = "no_simd"), not(miri)))] use self::x86_64 as arch; -#[cfg(any(miri, not(all(feature = "simd", any(target_arch = "x86_64")))))] +#[cfg(any(feature = "no_simd", miri, not(target_arch = "x86_64"), all(target_arch = "x86_64", not(target_feature = "sse2"))))] pub mod generic; -#[cfg(any(miri, not(all(feature = "simd", any(target_arch = "x86_64")))))] +#[cfg(any(feature = "no_simd", miri, not(target_arch = "x86_64"), all(target_arch = "x86_64", not(target_feature = "sse2"))))] use self::generic as arch; pub use self::arch::{vec128_storage, vec256_storage, vec512_storage}; diff --git a/vendor/ppv-lite86/src/soft.rs b/vendor/ppv-lite86/src/soft.rs index d12dac528..0ae390c44 100644 --- a/vendor/ppv-lite86/src/soft.rs +++ b/vendor/ppv-lite86/src/soft.rs @@ -1,9 +1,9 @@ //! Implement 256- and 512- bit in terms of 128-bit, for machines without native wide SIMD. -use core::marker::PhantomData; -use core::ops::*; use crate::types::*; use crate::{vec128_storage, vec256_storage, vec512_storage}; +use core::marker::PhantomData; +use core::ops::*; #[derive(Copy, Clone, Default)] #[allow(non_camel_case_types)] @@ -175,26 +175,50 @@ impl<W: BSwap + Copy, G> BSwap for x2<W, G> { impl<W: StoreBytes + BSwap + Copy, G> StoreBytes for x2<W, G> { #[inline(always)] unsafe fn unsafe_read_le(input: &[u8]) -> Self { - let input = input.split_at(16); + let input = input.split_at(input.len() / 2); x2::new([W::unsafe_read_le(input.0), W::unsafe_read_le(input.1)]) } #[inline(always)] unsafe fn unsafe_read_be(input: &[u8]) -> Self { - x2::unsafe_read_le(input).bswap() + let input = input.split_at(input.len() / 2); + x2::new([W::unsafe_read_be(input.0), W::unsafe_read_be(input.1)]) } #[inline(always)] fn write_le(self, out: &mut [u8]) { - let out = out.split_at_mut(16); + let out = out.split_at_mut(out.len() / 2); self.0[0].write_le(out.0); self.0[1].write_le(out.1); } #[inline(always)] fn write_be(self, out: &mut [u8]) { - let out = out.split_at_mut(16); + let out = out.split_at_mut(out.len() / 2); self.0[0].write_be(out.0); self.0[1].write_be(out.1); } } +impl<W: Copy + LaneWords4, G: Copy> LaneWords4 for x2<W, G> { + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + Self::new([ + self.0[0].shuffle_lane_words2301(), + self.0[1].shuffle_lane_words2301(), + ]) + } + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + Self::new([ + self.0[0].shuffle_lane_words1230(), + self.0[1].shuffle_lane_words1230(), + ]) + } + #[inline(always)] + fn shuffle_lane_words3012(self) -> Self { + Self::new([ + self.0[0].shuffle_lane_words3012(), + self.0[1].shuffle_lane_words3012(), + ]) + } +} #[derive(Copy, Clone, Default)] 
#[allow(non_camel_case_types)] @@ -238,7 +262,12 @@ macro_rules! fwd_unop_x4 { ($fn:ident) => { #[inline(always)] fn $fn(self) -> Self { - x4([self.0[0].$fn(), self.0[1].$fn(), self.0[2].$fn(), self.0[3].$fn()]) + x4([ + self.0[0].$fn(), + self.0[1].$fn(), + self.0[2].$fn(), + self.0[3].$fn(), + ]) } }; } @@ -305,6 +334,20 @@ impl<W: Copy> Vec4<W> for x4<W> { self } } +impl<W: Copy> Vec4Ext<W> for x4<W> { + #[inline(always)] + fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) + where + Self: Sized, + { + ( + x4([a.0[0], b.0[0], c.0[0], d.0[0]]), + x4([a.0[1], b.0[1], c.0[1], d.0[1]]), + x4([a.0[2], b.0[2], c.0[2], d.0[2]]), + x4([a.0[3], b.0[3], c.0[3], d.0[3]]), + ) + } +} impl<W: Copy + Store<vec128_storage>> Store<vec512_storage> for x4<W> { #[inline(always)] unsafe fn unpack(p: vec512_storage) -> Self { @@ -363,30 +406,39 @@ impl<W: BSwap + Copy> BSwap for x4<W> { impl<W: StoreBytes + BSwap + Copy> StoreBytes for x4<W> { #[inline(always)] unsafe fn unsafe_read_le(input: &[u8]) -> Self { + let n = input.len() / 4; x4([ - W::unsafe_read_le(&input[0..16]), - W::unsafe_read_le(&input[16..32]), - W::unsafe_read_le(&input[32..48]), - W::unsafe_read_le(&input[48..64]), + W::unsafe_read_le(&input[..n]), + W::unsafe_read_le(&input[n..n * 2]), + W::unsafe_read_le(&input[n * 2..n * 3]), + W::unsafe_read_le(&input[n * 3..]), ]) } #[inline(always)] unsafe fn unsafe_read_be(input: &[u8]) -> Self { - x4::unsafe_read_le(input).bswap() + let n = input.len() / 4; + x4([ + W::unsafe_read_be(&input[..n]), + W::unsafe_read_be(&input[n..n * 2]), + W::unsafe_read_be(&input[n * 2..n * 3]), + W::unsafe_read_be(&input[n * 3..]), + ]) } #[inline(always)] fn write_le(self, out: &mut [u8]) { - self.0[0].write_le(&mut out[0..16]); - self.0[1].write_le(&mut out[16..32]); - self.0[2].write_le(&mut out[32..48]); - self.0[3].write_le(&mut out[48..64]); + let n = out.len() / 4; + self.0[0].write_le(&mut out[..n]); + self.0[1].write_le(&mut out[n..n * 2]); + self.0[2].write_le(&mut out[n * 2..n * 3]); + self.0[3].write_le(&mut out[n * 3..]); } #[inline(always)] fn write_be(self, out: &mut [u8]) { - self.0[0].write_be(&mut out[0..16]); - self.0[1].write_be(&mut out[16..32]); - self.0[2].write_be(&mut out[32..48]); - self.0[3].write_be(&mut out[48..64]); + let n = out.len() / 4; + self.0[0].write_be(&mut out[..n]); + self.0[1].write_be(&mut out[n..n * 2]); + self.0[2].write_be(&mut out[n * 2..n * 3]); + self.0[3].write_be(&mut out[n * 3..]); } } impl<W: Copy + LaneWords4> LaneWords4 for x4<W> { diff --git a/vendor/ppv-lite86/src/types.rs b/vendor/ppv-lite86/src/types.rs index 119b6bb8d..f9f3bf1ce 100644 --- a/vendor/ppv-lite86/src/types.rs +++ b/vendor/ppv-lite86/src/types.rs @@ -1,3 +1,4 @@ +#![allow(non_camel_case_types)] use core::ops::{Add, AddAssign, BitAnd, BitOr, BitXor, BitXorAssign, Not}; pub trait AndNot { @@ -44,182 +45,188 @@ pub trait RotateEachWord64 { pub trait RotateEachWord128 {} -#[allow(non_camel_case_types)] -mod types { - //! Vector type naming scheme: - //! uN[xP]xL - //! Unsigned; N-bit words * P bits per lane * L lanes - //! - //! A lane is always 128-bits, chosen because common SIMD architectures treat 128-bit units of - //! wide vectors specially (supporting e.g. intra-lane shuffles), and tend to have limited and - //! slow inter-lane operations. 
+// Vector type naming scheme: +// uN[xP]xL +// Unsigned; N-bit words * P bits per lane * L lanes +// +// A lane is always 128-bits, chosen because common SIMD architectures treat 128-bit units of +// wide vectors specially (supporting e.g. intra-lane shuffles), and tend to have limited and +// slow inter-lane operations. - use crate::arch::{vec128_storage, vec256_storage, vec512_storage}; - use crate::{ArithOps, BitOps128, BitOps32, BitOps64, Machine, Store, StoreBytes}; +use crate::arch::{vec128_storage, vec256_storage, vec512_storage}; - pub trait UnsafeFrom<T> { - unsafe fn unsafe_from(t: T) -> Self; - } +#[allow(clippy::missing_safety_doc)] +pub trait UnsafeFrom<T> { + unsafe fn unsafe_from(t: T) -> Self; +} - /// A vector composed of two elements, which may be words or themselves vectors. - pub trait Vec2<W> { - fn extract(self, i: u32) -> W; - fn insert(self, w: W, i: u32) -> Self; - } +/// A vector composed of two elements, which may be words or themselves vectors. +pub trait Vec2<W> { + fn extract(self, i: u32) -> W; + fn insert(self, w: W, i: u32) -> Self; +} - /// A vector composed of four elements, which may be words or themselves vectors. - pub trait Vec4<W> { - fn extract(self, i: u32) -> W; - fn insert(self, w: W, i: u32) -> Self; - } +/// A vector composed of four elements, which may be words or themselves vectors. +pub trait Vec4<W> { + fn extract(self, i: u32) -> W; + fn insert(self, w: W, i: u32) -> Self; +} +/// Vec4 functions which may not be implemented yet for all Vec4 types. +/// NOTE: functions in this trait may be moved to Vec4 in any patch release. To avoid breakage, +/// import Vec4Ext only together with Vec4, and don't qualify its methods. +pub trait Vec4Ext<W> { + fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) + where + Self: Sized; +} +pub trait Vector<T> { + fn to_scalars(self) -> T; +} - // TODO: multiples of 4 should inherit this - /// A vector composed of four words; depending on their size, operations may cross lanes. - pub trait Words4 { - fn shuffle1230(self) -> Self; - fn shuffle2301(self) -> Self; - fn shuffle3012(self) -> Self; - } +// TODO: multiples of 4 should inherit this +/// A vector composed of four words; depending on their size, operations may cross lanes. +pub trait Words4 { + fn shuffle1230(self) -> Self; + fn shuffle2301(self) -> Self; + fn shuffle3012(self) -> Self; +} - /// A vector composed one or more lanes each composed of four words. - pub trait LaneWords4 { - fn shuffle_lane_words1230(self) -> Self; - fn shuffle_lane_words2301(self) -> Self; - fn shuffle_lane_words3012(self) -> Self; - } +/// A vector composed one or more lanes each composed of four words. 
+pub trait LaneWords4 { + fn shuffle_lane_words1230(self) -> Self; + fn shuffle_lane_words2301(self) -> Self; + fn shuffle_lane_words3012(self) -> Self; +} - // TODO: make this a part of BitOps - /// Exchange neigboring ranges of bits of the specified size - pub trait Swap64 { - fn swap1(self) -> Self; - fn swap2(self) -> Self; - fn swap4(self) -> Self; - fn swap8(self) -> Self; - fn swap16(self) -> Self; - fn swap32(self) -> Self; - fn swap64(self) -> Self; - } +// TODO: make this a part of BitOps +/// Exchange neigboring ranges of bits of the specified size +pub trait Swap64 { + fn swap1(self) -> Self; + fn swap2(self) -> Self; + fn swap4(self) -> Self; + fn swap8(self) -> Self; + fn swap16(self) -> Self; + fn swap32(self) -> Self; + fn swap64(self) -> Self; +} - pub trait u32x4<M: Machine>: - BitOps32 - + Store<vec128_storage> - + ArithOps - + Vec4<u32> - + Words4 - + LaneWords4 - + StoreBytes - + MultiLane<[u32; 4]> - + Into<vec128_storage> - { +pub trait u32x4<M: Machine>: + BitOps32 + + Store<vec128_storage> + + ArithOps + + Vec4<u32> + + Words4 + + LaneWords4 + + StoreBytes + + MultiLane<[u32; 4]> + + Into<vec128_storage> +{ } - pub trait u64x2<M: Machine>: - BitOps64 - + Store<vec128_storage> - + ArithOps - + Vec2<u64> - + MultiLane<[u64; 2]> - + Into<vec128_storage> - { +pub trait u64x2<M: Machine>: + BitOps64 + Store<vec128_storage> + ArithOps + Vec2<u64> + MultiLane<[u64; 2]> + Into<vec128_storage> +{ } - pub trait u128x1<M: Machine>: - BitOps128 + Store<vec128_storage> + Swap64 + MultiLane<[u128; 1]> + Into<vec128_storage> - { +pub trait u128x1<M: Machine>: + BitOps128 + Store<vec128_storage> + Swap64 + MultiLane<[u128; 1]> + Into<vec128_storage> +{ } - pub trait u32x4x2<M: Machine>: - BitOps32 - + Store<vec256_storage> - + Vec2<M::u32x4> - + MultiLane<[M::u32x4; 2]> - + ArithOps - + Into<vec256_storage> - { +pub trait u32x4x2<M: Machine>: + BitOps32 + + Store<vec256_storage> + + Vec2<M::u32x4> + + MultiLane<[M::u32x4; 2]> + + ArithOps + + Into<vec256_storage> + + StoreBytes +{ } - pub trait u64x2x2<M: Machine>: - BitOps64 - + Store<vec256_storage> - + Vec2<M::u64x2> - + MultiLane<[M::u64x2; 2]> - + ArithOps - + StoreBytes - + Into<vec256_storage> - { +pub trait u64x2x2<M: Machine>: + BitOps64 + + Store<vec256_storage> + + Vec2<M::u64x2> + + MultiLane<[M::u64x2; 2]> + + ArithOps + + StoreBytes + + Into<vec256_storage> +{ } - pub trait u64x4<M: Machine>: - BitOps64 - + Store<vec256_storage> - + Vec4<u64> - + MultiLane<[u64; 4]> - + ArithOps - + Words4 - + StoreBytes - + Into<vec256_storage> - { +pub trait u64x4<M: Machine>: + BitOps64 + + Store<vec256_storage> + + Vec4<u64> + + MultiLane<[u64; 4]> + + ArithOps + + Words4 + + StoreBytes + + Into<vec256_storage> +{ } - pub trait u128x2<M: Machine>: - BitOps128 - + Store<vec256_storage> - + Vec2<M::u128x1> - + MultiLane<[M::u128x1; 2]> - + Swap64 - + Into<vec256_storage> - { +pub trait u128x2<M: Machine>: + BitOps128 + + Store<vec256_storage> + + Vec2<M::u128x1> + + MultiLane<[M::u128x1; 2]> + + Swap64 + + Into<vec256_storage> +{ } - pub trait u32x4x4<M: Machine>: - BitOps32 - + Store<vec512_storage> - + Vec4<M::u32x4> - + MultiLane<[M::u32x4; 4]> - + ArithOps - + LaneWords4 - + Into<vec512_storage> - { +pub trait u32x4x4<M: Machine>: + BitOps32 + + Store<vec512_storage> + + Vec4<M::u32x4> + + Vec4Ext<M::u32x4> + + Vector<[u32; 16]> + + MultiLane<[M::u32x4; 4]> + + ArithOps + + LaneWords4 + + Into<vec512_storage> + + StoreBytes +{ } - pub trait u64x2x4<M: Machine>: - BitOps64 - + Store<vec512_storage> - + Vec4<M::u64x2> - + 
MultiLane<[M::u64x2; 4]> - + ArithOps - + Into<vec512_storage> - { +pub trait u64x2x4<M: Machine>: + BitOps64 + + Store<vec512_storage> + + Vec4<M::u64x2> + + MultiLane<[M::u64x2; 4]> + + ArithOps + + Into<vec512_storage> +{ } - // TODO: Words4 - pub trait u128x4<M: Machine>: - BitOps128 - + Store<vec512_storage> - + Vec4<M::u128x1> - + MultiLane<[M::u128x1; 4]> - + Swap64 - + Into<vec512_storage> - { +// TODO: Words4 +pub trait u128x4<M: Machine>: + BitOps128 + + Store<vec512_storage> + + Vec4<M::u128x1> + + MultiLane<[M::u128x1; 4]> + + Swap64 + + Into<vec512_storage> +{ } - /// A vector composed of multiple 128-bit lanes. - pub trait MultiLane<Lanes> { - /// Split a multi-lane vector into single-lane vectors. - fn to_lanes(self) -> Lanes; - /// Build a multi-lane vector from individual lanes. - fn from_lanes(lanes: Lanes) -> Self; - } +/// A vector composed of multiple 128-bit lanes. +pub trait MultiLane<Lanes> { + /// Split a multi-lane vector into single-lane vectors. + fn to_lanes(self) -> Lanes; + /// Build a multi-lane vector from individual lanes. + fn from_lanes(lanes: Lanes) -> Self; +} - /// Combine single vectors into a multi-lane vector. - pub trait VZip<V> { - fn vzip(self) -> V; - } +/// Combine single vectors into a multi-lane vector. +pub trait VZip<V> { + fn vzip(self) -> V; +} - impl<V, T> VZip<V> for T - where - V: MultiLane<T>, - { - #[inline(always)] - fn vzip(self) -> V { - V::from_lanes(self) - } +impl<V, T> VZip<V> for T +where + V: MultiLane<T>, +{ + #[inline(always)] + fn vzip(self) -> V { + V::from_lanes(self) } } -pub use self::types::*; pub trait Machine: Sized + Copy { type u32x4: u32x4<Self>; @@ -264,15 +271,27 @@ pub trait Machine: Sized + Copy { unsafe { V::unsafe_read_be(input) } } + /// # Safety + /// Caller must ensure the type of Self is appropriate for the hardware of the execution + /// environment. unsafe fn instance() -> Self; } pub trait Store<S> { + /// # Safety + /// Caller must ensure the type of Self is appropriate for the hardware of the execution + /// environment. unsafe fn unpack(p: S) -> Self; } pub trait StoreBytes { + /// # Safety + /// Caller must ensure the type of Self is appropriate for the hardware of the execution + /// environment. unsafe fn unsafe_read_le(input: &[u8]) -> Self; + /// # Safety + /// Caller must ensure the type of Self is appropriate for the hardware of the execution + /// environment. 
unsafe fn unsafe_read_be(input: &[u8]) -> Self; fn write_le(self, out: &mut [u8]); fn write_be(self, out: &mut [u8]); diff --git a/vendor/ppv-lite86/src/x86_64/mod.rs b/vendor/ppv-lite86/src/x86_64/mod.rs index ecf184f36..937732da3 100644 --- a/vendor/ppv-lite86/src/x86_64/mod.rs +++ b/vendor/ppv-lite86/src/x86_64/mod.rs @@ -1,7 +1,7 @@ // crate minimums: sse2, x86_64 -use core::arch::x86_64::{__m128i, __m256i}; use crate::types::*; +use core::arch::x86_64::{__m128i, __m256i}; mod sse2; @@ -79,7 +79,7 @@ where type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>; type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>; - type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>; + type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>; type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>; type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>; type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>; @@ -119,16 +119,16 @@ impl Store<vec128_storage> for vec128_storage { p } } -impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage { +impl<'a> From<&'a vec128_storage> for &'a [u32; 4] { #[inline(always)] - fn into(self) -> &'a [u32; 4] { - unsafe { &self.u32x4 } + fn from(x: &'a vec128_storage) -> Self { + unsafe { &x.u32x4 } } } -impl Into<vec128_storage> for [u32; 4] { +impl From<[u32; 4]> for vec128_storage { #[inline(always)] - fn into(self) -> vec128_storage { - vec128_storage { u32x4: self } + fn from(u32x4: [u32; 4]) -> Self { + vec128_storage { u32x4 } } } impl Default for vec128_storage { @@ -154,10 +154,10 @@ pub union vec256_storage { sse2: [vec128_storage; 2], avx: __m256i, } -impl Into<vec256_storage> for [u64; 4] { +impl From<[u64; 4]> for vec256_storage { #[inline(always)] - fn into(self) -> vec256_storage { - vec256_storage { u64x4: self } + fn from(u64x4: [u64; 4]) -> Self { + vec256_storage { u64x4 } } } impl Default for vec256_storage { @@ -167,9 +167,11 @@ impl Default for vec256_storage { } } impl vec256_storage { + #[inline(always)] pub fn new128(xs: [vec128_storage; 2]) -> Self { Self { sse2: xs } } + #[inline(always)] pub fn split128(self) -> [vec128_storage; 2] { unsafe { self.sse2 } } @@ -200,9 +202,11 @@ impl Default for vec512_storage { } } impl vec512_storage { + #[inline(always)] pub fn new128(xs: [vec128_storage; 4]) -> Self { Self { sse2: xs } } + #[inline(always)] pub fn split128(self) -> [vec128_storage; 4] { unsafe { self.sse2 } } @@ -217,10 +221,10 @@ impl PartialEq for vec512_storage { macro_rules! impl_into { ($storage:ident, $array:ty, $name:ident) => { - impl Into<$array> for $storage { + impl From<$storage> for $array { #[inline(always)] - fn into(self) -> $array { - unsafe { self.$name } + fn from(vec: $storage) -> Self { + unsafe { vec.$name } } } }; diff --git a/vendor/ppv-lite86/src/x86_64/sse2.rs b/vendor/ppv-lite86/src/x86_64/sse2.rs index 60e7681c3..97197a436 100644 --- a/vendor/ppv-lite86/src/x86_64/sse2.rs +++ b/vendor/ppv-lite86/src/x86_64/sse2.rs @@ -166,49 +166,44 @@ macro_rules! impl_bitops128 { macro_rules! rotr_32_s3 { ($name:ident, $k0:expr, $k1:expr) => { - #[inline(always)] - fn $name(self) -> Self { - Self::new(unsafe { - _mm_shuffle_epi8( - self.x, - _mm_set_epi64x($k0, $k1), - ) - }) + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) } }; } macro_rules! 
rotr_32 { ($name:ident, $i:expr) => { - #[inline(always)] - fn $name(self) -> Self { - Self::new(unsafe { - _mm_or_si128( - _mm_srli_epi32(self.x, $i as i32), - _mm_slli_epi32(self.x, 32 - $i as i32), - ) - }) - } + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + _mm_or_si128( + _mm_srli_epi32(self.x, $i as i32), + _mm_slli_epi32(self.x, 32 - $i as i32), + ) + }) + } }; } impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> { rotr_32!(rotate_each_word_right7, 7); rotr_32_s3!( rotate_each_word_right8, - 0x0c0f0e0d_080b0a09, - 0x04070605_00030201 + 0x0c0f_0e0d_080b_0a09, + 0x0407_0605_0003_0201 ); rotr_32!(rotate_each_word_right11, 11); rotr_32!(rotate_each_word_right12, 12); rotr_32_s3!( rotate_each_word_right16, - 0x0d0c0f0e_09080b0a, - 0x05040706_01000302 + 0x0d0c_0f0e_0908_0b0a, + 0x0504_0706_0100_0302 ); rotr_32!(rotate_each_word_right20, 20); rotr_32_s3!( rotate_each_word_right24, - 0x0e0d0c0f_0a09080b, - 0x06050407_02010003 + 0x0e0d_0c0f_0a09_080b, + 0x0605_0407_0201_0003 ); rotr_32!(rotate_each_word_right25, 25); } @@ -228,28 +223,23 @@ impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> { macro_rules! rotr_64_s3 { ($name:ident, $k0:expr, $k1:expr) => { - #[inline(always)] - fn $name(self) -> Self { - Self::new(unsafe { - _mm_shuffle_epi8( - self.x, - _mm_set_epi64x($k0, $k1), - ) - }) + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) } }; } macro_rules! rotr_64 { ($name:ident, $i:expr) => { - #[inline(always)] - fn $name(self) -> Self { - Self::new(unsafe { - _mm_or_si128( - _mm_srli_epi64(self.x, $i as i32), - _mm_slli_epi64(self.x, 64 - $i as i32), - ) - }) - } + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + _mm_or_si128( + _mm_srli_epi64(self.x, $i as i32), + _mm_slli_epi64(self.x, 64 - $i as i32), + ) + }) + } }; } impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> { @@ -296,15 +286,15 @@ impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> { macro_rules! 
rotr_128 { ($name:ident, $i:expr) => { - #[inline(always)] - fn $name(self) -> Self { - Self::new(unsafe { - _mm_or_si128( - _mm_srli_si128(self.x, $i as i32), - _mm_slli_si128(self.x, 128 - $i as i32), - ) - }) - } + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + _mm_or_si128( + _mm_srli_si128(self.x, $i as i32), + _mm_slli_si128(self.x, 128 - $i as i32), + ) + }) + } }; } // TODO: completely unoptimized @@ -411,7 +401,7 @@ impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> { } #[inline(always)] fn from_lanes(xs: [u128; 1]) -> Self { - unimplemented!() + unimplemented!("{:?}", xs) } } @@ -780,7 +770,7 @@ impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> { impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> { #[inline(always)] fn bswap(self) -> Self { - Self::new(unsafe { unimplemented!() }) + unimplemented!() } } @@ -890,6 +880,13 @@ pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>; #[allow(non_camel_case_types)] pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>; +impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> { + #[inline(always)] + fn to_scalars(self) -> [u32; 16] { + unsafe { core::mem::transmute(self) } + } +} + impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI> where u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap, @@ -993,6 +990,8 @@ where Machine86<S3, S4, NI>: Machine, u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>, u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>, + u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>, + u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>, { } impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI> @@ -1014,14 +1013,6 @@ where { } -impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI> -where - u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap, - Avx2Machine<NI>: Machine, - u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>, - u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>, -{ -} impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI> where u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, @@ -1078,6 +1069,7 @@ impl<W: PartialEq, G> PartialEq for x2<W, G> { } } +#[allow(unused)] #[inline(always)] unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool { let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110); @@ -1383,65 +1375,78 @@ mod test { pub mod avx2 { #![allow(non_camel_case_types)] - use crate::soft::x4; + use crate::soft::{x2, x4}; use crate::types::*; - use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2}; + use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0}; use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4}; use core::arch::x86_64::*; use core::marker::PhantomData; use core::ops::*; #[derive(Copy, Clone)] - pub struct u32x4x4_avx2<NI> { - x: [__m256i; 2], + pub struct u32x4x2_avx2<NI> { + x: __m256i, ni: PhantomData<NI>, } - impl<NI> u32x4x4_avx2<NI> { + impl<NI> u32x4x2_avx2<NI> { #[inline(always)] - fn new(x: [__m256i; 2]) -> Self { + fn new(x: __m256i) -> Self { Self { x, ni: PhantomData } } } - impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {} - impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> { + impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {} + impl<NI> Store<vec256_storage> for 
u32x4x2_avx2<NI> { #[inline(always)] - unsafe fn unpack(p: vec512_storage) -> Self { - Self::new([p.avx[0].avx, p.avx[1].avx]) + unsafe fn unpack(p: vec256_storage) -> Self { + Self::new(p.avx) } } - impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> { + impl<NI> StoreBytes for u32x4x2_avx2<NI> { #[inline(always)] - fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] { + unsafe fn unsafe_read_le(input: &[u8]) -> Self { + assert_eq!(input.len(), 32); + Self::new(_mm256_loadu_si256(input.as_ptr() as *const _)) + } + #[inline(always)] + unsafe fn unsafe_read_be(input: &[u8]) -> Self { + Self::unsafe_read_le(input).bswap() + } + #[inline(always)] + fn write_le(self, out: &mut [u8]) { unsafe { - [ - u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)), - u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)), - u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)), - u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)), - ] + assert_eq!(out.len(), 32); + _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x) } } #[inline(always)] - fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self { - Self::new(unsafe { + fn write_be(self, out: &mut [u8]) { + self.bswap().write_le(out) + } + } + impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> { + #[inline(always)] + fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] { + unsafe { [ - _mm256_setr_m128i(x[0].x, x[1].x), - _mm256_setr_m128i(x[2].x, x[3].x), + u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), + u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), ] - }) + } + } + #[inline(always)] + fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self { + Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) }) } } - impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { + impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> { #[inline(always)] fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { unsafe { match i { - 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)), - 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)), - 2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)), - 3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)), + 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), + 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), _ => panic!(), } } @@ -1450,61 +1455,21 @@ pub mod avx2 { fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { Self::new(unsafe { match i { - 0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]], - 1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]], - 2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)], - 3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)], + 0 => _mm256_inserti128_si256(self.x, w.x, 0), + 1 => _mm256_inserti128_si256(self.x, w.x, 1), _ => panic!(), } }) } } - impl<NI> LaneWords4 for u32x4x4_avx2<NI> { - #[inline(always)] - fn shuffle_lane_words1230(self) -> Self { - Self::new(unsafe { - [ - _mm256_shuffle_epi32(self.x[0], 0b1001_0011), - _mm256_shuffle_epi32(self.x[1], 0b1001_0011), - ] - }) - } - #[inline(always)] - fn shuffle_lane_words2301(self) -> Self { - Self::new(unsafe { - [ - _mm256_shuffle_epi32(self.x[0], 0b0100_1110), - _mm256_shuffle_epi32(self.x[1], 0b0100_1110), - ] - }) - } - #[inline(always)] - fn shuffle_lane_words3012(self) -> Self { - Self::new(unsafe { - [ - _mm256_shuffle_epi32(self.x[0], 0b0011_1001), - _mm256_shuffle_epi32(self.x[1], 0b0011_1001), - ] - }) - } - } - impl<NI> 
BitOps32 for u32x4x4_avx2<NI> where NI: Copy {} - impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {} + impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {} + impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {} macro_rules! shuf_lane_bytes { ($name:ident, $k0:expr, $k1:expr) => { - #[inline(always)] - fn $name(self) -> Self { - Self::new(unsafe { - [ - _mm256_shuffle_epi8( - self.x[0], - _mm256_set_epi64x($k0, $k1, $k0, $k1), - ), - _mm256_shuffle_epi8( - self.x[1], - _mm256_set_epi64x($k0, $k1, $k0, $k1), - ) - ] + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1)) }) } }; @@ -1514,52 +1479,41 @@ pub mod avx2 { #[inline(always)] fn $name(self) -> Self { Self::new(unsafe { - [ - _mm256_or_si256( - _mm256_srli_epi32(self.x[0], $i as i32), - _mm256_slli_epi32(self.x[0], 32 - $i as i32), - ), - _mm256_or_si256( - _mm256_srli_epi32(self.x[1], $i as i32), - _mm256_slli_epi32(self.x[1], 32 - $i as i32), - ) - ] + _mm256_or_si256( + _mm256_srli_epi32(self.x, $i as i32), + _mm256_slli_epi32(self.x, 32 - $i as i32), + ) }) } }; } - impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> { + impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> { rotr_32!(rotate_each_word_right7, 7); shuf_lane_bytes!( rotate_each_word_right8, - 0x0c0f0e0d_080b0a09, - 0x04070605_00030201 + 0x0c0f_0e0d_080b_0a09, + 0x0407_0605_0003_0201 ); rotr_32!(rotate_each_word_right11, 11); rotr_32!(rotate_each_word_right12, 12); shuf_lane_bytes!( rotate_each_word_right16, - 0x0d0c0f0e_09080b0a, - 0x05040706_01000302 + 0x0d0c_0f0e_0908_0b0a, + 0x0504_0706_0100_0302 ); rotr_32!(rotate_each_word_right20, 20); shuf_lane_bytes!( rotate_each_word_right24, - 0x0e0d0c0f_0a09080b, - 0x06050407_02010003 + 0x0e0d_0c0f_0a09_080b, + 0x0605_0407_0201_0003 ); rotr_32!(rotate_each_word_right25, 25); } - impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {} - impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage { + impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {} + impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage { #[inline(always)] - fn from(x: u32x4x4_avx2<NI>) -> Self { - Self { - avx: [ - vec256_storage { avx: x.x[0] }, - vec256_storage { avx: x.x[1] }, - ], - } + fn from(x: u32x4x2_avx2<NI>) -> Self { + Self { avx: x.x } } } @@ -1576,55 +1530,172 @@ pub mod avx2 { } }; } - impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor); - impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor); - impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand); - impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add); + impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor); + impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor); + impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand); + impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add); - macro_rules! impl_bitop_x2 { + macro_rules! 
impl_bitop { ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => { impl<NI> $Op for $vec<NI> { type Output = Self; #[inline(always)] fn $op_fn(self, rhs: Self) -> Self::Output { - Self::new(unsafe { - [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])] - }) + Self::new(unsafe { $impl_fn(self.x, rhs.x) }) } } }; } - impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256); - impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256); - impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256); - impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256); - impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32); + impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256); + impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256); + impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256); + impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256); + impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32); - impl<NI> Not for u32x4x4_avx2<NI> { + impl<NI> Not for u32x4x2_avx2<NI> { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { unsafe { let f = _mm256_set1_epi8(-0x7f); - Self::new([f, f]) ^ self + Self::new(f) ^ self } } } - impl<NI> BSwap for u32x4x4_avx2<NI> { + impl<NI> BSwap for u32x4x2_avx2<NI> { shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203); } - impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> + impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI> where NI: Copy, { #[inline(always)] + fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self { + Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) }) + } + } + + impl<NI> LaneWords4 for u32x4x2_avx2<NI> { + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) }) + } + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) }) + } + #[inline(always)] + fn shuffle_lane_words3012(self) -> Self { + Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) }) + } + } + + /////////////////////////////////////////////////////////////////////////////////////////// + + pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>; + impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {} + + impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> { + #[inline(always)] + unsafe fn unpack(p: vec512_storage) -> Self { + Self::new([ + u32x4x2_avx2::unpack(p.avx[0]), + u32x4x2_avx2::unpack(p.avx[1]), + ]) + } + } + impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> { + #[inline(always)] + fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] { + let [a, b] = self.0[0].to_lanes(); + let [c, d] = self.0[1].to_lanes(); + [a, b, c, d] + } + #[inline(always)] + fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self { + let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]); + let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]); + Self::new([ab, cd]) + } + } + impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { + #[inline(always)] + fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { + match i { + 0 => self.0[0].extract(0), + 1 => self.0[0].extract(1), + 2 => self.0[1].extract(0), + 3 => self.0[1].extract(1), + _ => panic!(), + } + } + #[inline(always)] + fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { + Self::new(match i { + 0 | 1 => [self.0[0].insert(w, i), self.0[1]], + 2 | 3 => [self.0[0], 
self.0[1].insert(w, i - 2)], + _ => panic!(), + }) + } + } + impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { + #[inline(always)] + fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { + /* + * a00:a01 a10:a11 + * b00:b01 b10:b11 + * c00:c01 c10:c11 + * d00:d01 d10:d11 + * => + * a00:b00 c00:d00 + * a01:b01 c01:d01 + * a10:b10 c10:d10 + * a11:b11 c11:d11 + */ + unsafe { + let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20)); + let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31)); + let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20)); + let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31)); + let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20)); + let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31)); + let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20)); + let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31)); + ( + Self::new([ab00, cd00]), + Self::new([ab01, cd01]), + Self::new([ab10, cd10]), + Self::new([ab11, cd11]), + ) + } + } + } + impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> { + #[inline(always)] + fn to_scalars(self) -> [u32; 16] { + unsafe { core::mem::transmute(self) } + } + } + impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage { + #[inline(always)] + fn from(x: u32x4x4_avx2<NI>) -> Self { + Self { + avx: [ + vec256_storage { avx: x.0[0].x }, + vec256_storage { avx: x.0[1].x }, + ], + } + } + } + impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> { + #[inline(always)] fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self { Self::new(unsafe { [ - _mm256_setr_m128i(x.0[0].x, x.0[1].x), - _mm256_setr_m128i(x.0[2].x, x.0[3].x), + u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)), + u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)), ] }) } |
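A minimal sketch of the length-proportional slicing that the soft.rs hunk above switches to in the `StoreBytes` impls for `x2`/`x4`, replacing the old hard-coded 16-byte offsets. The helper name `split_quarters` is hypothetical; in the crate each chunk is fed to the inner word type's `unsafe_read_le`/`unsafe_read_be` rather than returned.

// Derive the chunk size from the input length instead of assuming 16 bytes,
// so the same logic serves both 128-bit and wider inner word types.
fn split_quarters(input: &[u8]) -> [&[u8]; 4] {
    let n = input.len() / 4;
    [
        &input[..n],
        &input[n..n * 2],
        &input[n * 2..n * 3],
        &input[n * 3..],
    ]
}

fn main() {
    // For a 64-byte buffer this reproduces the old 0..16, 16..32, ... layout,
    // but the same code also handles the smaller buffers used by 256-bit types.
    let buf = [0u8; 64];
    assert!(split_quarters(&buf).iter().all(|q| q.len() == 16));
}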