summaryrefslogtreecommitdiffstats
path: root/vendor/ppv-lite86/src
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/ppv-lite86/src')
-rw-r--r--vendor/ppv-lite86/src/generic.rs158
-rw-r--r--vendor/ppv-lite86/src/lib.rs8
-rw-r--r--vendor/ppv-lite86/src/soft.rs92
-rw-r--r--vendor/ppv-lite86/src/types.rs321
-rw-r--r--vendor/ppv-lite86/src/x86_64/mod.rs32
-rw-r--r--vendor/ppv-lite86/src/x86_64/sse2.rs431
6 files changed, 611 insertions, 431 deletions
diff --git a/vendor/ppv-lite86/src/generic.rs b/vendor/ppv-lite86/src/generic.rs
index 4f4113fc3..add6c4856 100644
--- a/vendor/ppv-lite86/src/generic.rs
+++ b/vendor/ppv-lite86/src/generic.rs
@@ -1,50 +1,50 @@
#![allow(non_camel_case_types)]
-use core::ops::*;
use crate::soft::{x2, x4};
use crate::types::*;
+use core::ops::*;
+#[repr(C)]
#[derive(Clone, Copy)]
pub union vec128_storage {
d: [u32; 4],
q: [u64; 2],
- o: [u128; 1],
}
impl From<[u32; 4]> for vec128_storage {
- #[inline]
+ #[inline(always)]
fn from(d: [u32; 4]) -> Self {
Self { d }
}
}
impl From<vec128_storage> for [u32; 4] {
- #[inline]
+ #[inline(always)]
fn from(d: vec128_storage) -> Self {
unsafe { d.d }
}
}
impl From<[u64; 2]> for vec128_storage {
- #[inline]
+ #[inline(always)]
fn from(q: [u64; 2]) -> Self {
Self { q }
}
}
impl From<vec128_storage> for [u64; 2] {
- #[inline]
+ #[inline(always)]
fn from(q: vec128_storage) -> Self {
unsafe { q.q }
}
}
impl Default for vec128_storage {
- #[inline]
+ #[inline(always)]
fn default() -> Self {
- Self { o: [0] }
+ Self { q: [0, 0] }
}
}
impl Eq for vec128_storage {}
impl PartialEq<vec128_storage> for vec128_storage {
- #[inline]
+ #[inline(always)]
fn eq(&self, rhs: &Self) -> bool {
- unsafe { self.o == rhs.o }
+ unsafe { self.q == rhs.q }
}
}
#[derive(Clone, Copy, PartialEq, Eq, Default)]
@@ -61,20 +61,22 @@ impl vec256_storage {
self.v128
}
}
-impl From<[u64; 4]> for vec256_storage {
- #[inline]
- fn from(q: [u64; 4]) -> Self {
- Self { v128: [[0, 1].into(), [2, 3].into()] }
- }
-}
impl From<vec256_storage> for [u64; 4] {
- #[inline]
+ #[inline(always)]
fn from(q: vec256_storage) -> Self {
let [a, b]: [u64; 2] = q.v128[0].into();
let [c, d]: [u64; 2] = q.v128[1].into();
[a, b, c, d]
}
}
+impl From<[u64; 4]> for vec256_storage {
+ #[inline(always)]
+ fn from([a, b, c, d]: [u64; 4]) -> Self {
+ Self {
+ v128: [[a, b].into(), [c, d].into()],
+ }
+ }
+}
#[derive(Clone, Copy, PartialEq, Eq, Default)]
pub struct vec512_storage {
v128: [vec128_storage; 4],
@@ -90,6 +92,7 @@ impl vec512_storage {
}
}
+#[inline(always)]
fn dmap<T, F>(t: T, f: F) -> T
where
T: Store<vec128_storage> + Into<vec128_storage>,
@@ -123,6 +126,7 @@ where
unsafe { T::unpack(d) }
}
+#[inline(always)]
fn qmap<T, F>(t: T, f: F) -> T
where
T: Store<vec128_storage> + Into<vec128_storage>,
@@ -136,6 +140,7 @@ where
unsafe { T::unpack(q) }
}
+#[inline(always)]
fn qmap2<T, F>(a: T, b: T, f: F) -> T
where
T: Store<vec128_storage> + Into<vec128_storage>,
@@ -151,17 +156,29 @@ where
unsafe { T::unpack(q) }
}
+#[inline(always)]
+fn o_of_q(q: [u64; 2]) -> u128 {
+ u128::from(q[0]) | (u128::from(q[1]) << 64)
+}
+
+#[inline(always)]
+fn q_of_o(o: u128) -> [u64; 2] {
+ [o as u64, (o >> 64) as u64]
+}
+
+#[inline(always)]
fn omap<T, F>(a: T, f: F) -> T
where
T: Store<vec128_storage> + Into<vec128_storage>,
F: Fn(u128) -> u128,
{
let a: vec128_storage = a.into();
- let ao = unsafe { a.o };
- let o = vec128_storage { o: [f(ao[0])] };
+ let ao = o_of_q(unsafe { a.q });
+ let o = vec128_storage { q: q_of_o(f(ao)) };
unsafe { T::unpack(o) }
}
+#[inline(always)]
fn omap2<T, F>(a: T, b: T, f: F) -> T
where
T: Store<vec128_storage> + Into<vec128_storage>,
@@ -169,10 +186,10 @@ where
{
let a: vec128_storage = a.into();
let b: vec128_storage = b.into();
- let ao = unsafe { a.o };
- let bo = unsafe { b.o };
+ let ao = o_of_q(unsafe { a.q });
+ let bo = o_of_q(unsafe { b.q });
let o = vec128_storage {
- o: [f(ao[0], bo[0])],
+ q: q_of_o(f(ao, bo)),
};
unsafe { T::unpack(o) }
}
@@ -245,39 +262,39 @@ macro_rules! impl_bitops {
}
impl Swap64 for $vec {
- #[inline]
+ #[inline(always)]
fn swap1(self) -> Self {
qmap(self, |x| {
((x & 0x5555555555555555) << 1) | ((x & 0xaaaaaaaaaaaaaaaa) >> 1)
})
}
- #[inline]
+ #[inline(always)]
fn swap2(self) -> Self {
qmap(self, |x| {
((x & 0x3333333333333333) << 2) | ((x & 0xcccccccccccccccc) >> 2)
})
}
- #[inline]
+ #[inline(always)]
fn swap4(self) -> Self {
qmap(self, |x| {
((x & 0x0f0f0f0f0f0f0f0f) << 4) | ((x & 0xf0f0f0f0f0f0f0f0) >> 4)
})
}
- #[inline]
+ #[inline(always)]
fn swap8(self) -> Self {
qmap(self, |x| {
((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8)
})
}
- #[inline]
+ #[inline(always)]
fn swap16(self) -> Self {
dmap(self, |x| x.rotate_left(16))
}
- #[inline]
+ #[inline(always)]
fn swap32(self) -> Self {
qmap(self, |x| x.rotate_left(32))
}
- #[inline]
+ #[inline(always)]
fn swap64(self) -> Self {
omap(self, |x| (x << 64) | (x >> 64))
}
@@ -289,82 +306,83 @@ impl_bitops!(u64x2_generic);
impl_bitops!(u128x1_generic);
impl RotateEachWord32 for u32x4_generic {
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right7(self) -> Self {
dmap(self, |x| x.rotate_right(7))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right8(self) -> Self {
dmap(self, |x| x.rotate_right(8))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right11(self) -> Self {
dmap(self, |x| x.rotate_right(11))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right12(self) -> Self {
dmap(self, |x| x.rotate_right(12))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right16(self) -> Self {
dmap(self, |x| x.rotate_right(16))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right20(self) -> Self {
dmap(self, |x| x.rotate_right(20))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right24(self) -> Self {
dmap(self, |x| x.rotate_right(24))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right25(self) -> Self {
dmap(self, |x| x.rotate_right(25))
}
}
impl RotateEachWord32 for u64x2_generic {
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right7(self) -> Self {
qmap(self, |x| x.rotate_right(7))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right8(self) -> Self {
qmap(self, |x| x.rotate_right(8))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right11(self) -> Self {
qmap(self, |x| x.rotate_right(11))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right12(self) -> Self {
qmap(self, |x| x.rotate_right(12))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right16(self) -> Self {
qmap(self, |x| x.rotate_right(16))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right20(self) -> Self {
qmap(self, |x| x.rotate_right(20))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right24(self) -> Self {
qmap(self, |x| x.rotate_right(24))
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right25(self) -> Self {
qmap(self, |x| x.rotate_right(25))
}
}
impl RotateEachWord64 for u64x2_generic {
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right32(self) -> Self {
qmap(self, |x| x.rotate_right(32))
}
}
// workaround for koute/cargo-web#52 (u128::rotate_* broken with cargo web)
+#[inline(always)]
fn rotate_u128_right(x: u128, i: u32) -> u128 {
(x >> i) | (x << (128 - i))
}
@@ -375,41 +393,41 @@ fn test_rotate_u128() {
}
impl RotateEachWord32 for u128x1_generic {
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right7(self) -> Self {
Self([rotate_u128_right(self.0[0], 7)])
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right8(self) -> Self {
Self([rotate_u128_right(self.0[0], 8)])
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right11(self) -> Self {
Self([rotate_u128_right(self.0[0], 11)])
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right12(self) -> Self {
Self([rotate_u128_right(self.0[0], 12)])
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right16(self) -> Self {
Self([rotate_u128_right(self.0[0], 16)])
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right20(self) -> Self {
Self([rotate_u128_right(self.0[0], 20)])
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right24(self) -> Self {
Self([rotate_u128_right(self.0[0], 24)])
}
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right25(self) -> Self {
Self([rotate_u128_right(self.0[0], 25)])
}
}
impl RotateEachWord64 for u128x1_generic {
- #[inline]
+ #[inline(always)]
fn rotate_each_word_right32(self) -> Self {
Self([rotate_u128_right(self.0[0], 32)])
}
@@ -428,7 +446,7 @@ impl Machine for GenericMachine {
type u32x4x4 = u32x4x4_generic;
type u64x2x4 = u64x2x4_generic;
type u128x4 = u128x4_generic;
- #[inline]
+ #[inline(always)]
unsafe fn instance() -> Self {
Self
}
@@ -456,7 +474,7 @@ impl From<u64x2_generic> for vec128_storage {
impl From<u128x1_generic> for vec128_storage {
#[inline(always)]
fn from(o: u128x1_generic) -> Self {
- Self { o: o.0 }
+ Self { q: q_of_o(o.0[0]) }
}
}
@@ -475,7 +493,7 @@ impl Store<vec128_storage> for u64x2_generic {
impl Store<vec128_storage> for u128x1_generic {
#[inline(always)]
unsafe fn unpack(s: vec128_storage) -> Self {
- Self(s.o)
+ Self([o_of_q(s.q); 1])
}
}
@@ -605,6 +623,22 @@ pub type u32x4x4_generic = x4<u32x4_generic>;
pub type u64x2x4_generic = x4<u64x2_generic>;
pub type u128x4_generic = x4<u128x1_generic>;
+impl Vector<[u32; 16]> for u32x4x4_generic {
+ fn to_scalars(self) -> [u32; 16] {
+ let [a, b, c, d] = self.0;
+ let a = a.0;
+ let b = b.0;
+ let c = c.0;
+ let d = d.0;
+ [
+ a[0], a[1], a[2], a[3], //
+ b[0], b[1], b[2], b[3], //
+ c[0], c[1], c[2], c[3], //
+ d[0], d[1], d[2], d[3], //
+ ]
+ }
+}
+
impl MultiLane<[u32; 4]> for u32x4_generic {
#[inline(always)]
fn to_lanes(self) -> [u32; 4] {
@@ -745,7 +779,7 @@ impl u128x4<GenericMachine> for u128x4_generic {}
#[macro_export]
macro_rules! dispatch {
($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
- #[inline]
+ #[inline(always)]
$($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
let $mach = unsafe { $crate::generic::GenericMachine::instance() };
#[inline(always)]
@@ -762,7 +796,7 @@ macro_rules! dispatch {
#[macro_export]
macro_rules! dispatch_light128 {
($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
- #[inline]
+ #[inline(always)]
$($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
let $mach = unsafe { $crate::generic::GenericMachine::instance() };
#[inline(always)]
@@ -779,7 +813,7 @@ macro_rules! dispatch_light128 {
#[macro_export]
macro_rules! dispatch_light256 {
($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
- #[inline]
+ #[inline(always)]
$($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
let $mach = unsafe { $crate::generic::GenericMachine::instance() };
#[inline(always)]
@@ -796,7 +830,7 @@ macro_rules! dispatch_light256 {
#[macro_export]
macro_rules! dispatch_light512 {
($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
- #[inline]
+ #[inline(always)]
$($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
let $mach = unsafe { $crate::generic::GenericMachine::instance() };
#[inline(always)]
diff --git a/vendor/ppv-lite86/src/lib.rs b/vendor/ppv-lite86/src/lib.rs
index 43dc5d869..638552fc2 100644
--- a/vendor/ppv-lite86/src/lib.rs
+++ b/vendor/ppv-lite86/src/lib.rs
@@ -9,14 +9,14 @@ mod soft;
mod types;
pub use self::types::*;
-#[cfg(all(feature = "simd", target_arch = "x86_64", not(miri)))]
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(feature = "no_simd"), not(miri)))]
pub mod x86_64;
-#[cfg(all(feature = "simd", target_arch = "x86_64", not(miri)))]
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(feature = "no_simd"), not(miri)))]
use self::x86_64 as arch;
-#[cfg(any(miri, not(all(feature = "simd", any(target_arch = "x86_64")))))]
+#[cfg(any(feature = "no_simd", miri, not(target_arch = "x86_64"), all(target_arch = "x86_64", not(target_feature = "sse2"))))]
pub mod generic;
-#[cfg(any(miri, not(all(feature = "simd", any(target_arch = "x86_64")))))]
+#[cfg(any(feature = "no_simd", miri, not(target_arch = "x86_64"), all(target_arch = "x86_64", not(target_feature = "sse2"))))]
use self::generic as arch;
pub use self::arch::{vec128_storage, vec256_storage, vec512_storage};
diff --git a/vendor/ppv-lite86/src/soft.rs b/vendor/ppv-lite86/src/soft.rs
index d12dac528..0ae390c44 100644
--- a/vendor/ppv-lite86/src/soft.rs
+++ b/vendor/ppv-lite86/src/soft.rs
@@ -1,9 +1,9 @@
//! Implement 256- and 512- bit in terms of 128-bit, for machines without native wide SIMD.
-use core::marker::PhantomData;
-use core::ops::*;
use crate::types::*;
use crate::{vec128_storage, vec256_storage, vec512_storage};
+use core::marker::PhantomData;
+use core::ops::*;
#[derive(Copy, Clone, Default)]
#[allow(non_camel_case_types)]
@@ -175,26 +175,50 @@ impl<W: BSwap + Copy, G> BSwap for x2<W, G> {
impl<W: StoreBytes + BSwap + Copy, G> StoreBytes for x2<W, G> {
#[inline(always)]
unsafe fn unsafe_read_le(input: &[u8]) -> Self {
- let input = input.split_at(16);
+ let input = input.split_at(input.len() / 2);
x2::new([W::unsafe_read_le(input.0), W::unsafe_read_le(input.1)])
}
#[inline(always)]
unsafe fn unsafe_read_be(input: &[u8]) -> Self {
- x2::unsafe_read_le(input).bswap()
+ let input = input.split_at(input.len() / 2);
+ x2::new([W::unsafe_read_be(input.0), W::unsafe_read_be(input.1)])
}
#[inline(always)]
fn write_le(self, out: &mut [u8]) {
- let out = out.split_at_mut(16);
+ let out = out.split_at_mut(out.len() / 2);
self.0[0].write_le(out.0);
self.0[1].write_le(out.1);
}
#[inline(always)]
fn write_be(self, out: &mut [u8]) {
- let out = out.split_at_mut(16);
+ let out = out.split_at_mut(out.len() / 2);
self.0[0].write_be(out.0);
self.0[1].write_be(out.1);
}
}
+impl<W: Copy + LaneWords4, G: Copy> LaneWords4 for x2<W, G> {
+ #[inline(always)]
+ fn shuffle_lane_words2301(self) -> Self {
+ Self::new([
+ self.0[0].shuffle_lane_words2301(),
+ self.0[1].shuffle_lane_words2301(),
+ ])
+ }
+ #[inline(always)]
+ fn shuffle_lane_words1230(self) -> Self {
+ Self::new([
+ self.0[0].shuffle_lane_words1230(),
+ self.0[1].shuffle_lane_words1230(),
+ ])
+ }
+ #[inline(always)]
+ fn shuffle_lane_words3012(self) -> Self {
+ Self::new([
+ self.0[0].shuffle_lane_words3012(),
+ self.0[1].shuffle_lane_words3012(),
+ ])
+ }
+}
#[derive(Copy, Clone, Default)]
#[allow(non_camel_case_types)]
@@ -238,7 +262,12 @@ macro_rules! fwd_unop_x4 {
($fn:ident) => {
#[inline(always)]
fn $fn(self) -> Self {
- x4([self.0[0].$fn(), self.0[1].$fn(), self.0[2].$fn(), self.0[3].$fn()])
+ x4([
+ self.0[0].$fn(),
+ self.0[1].$fn(),
+ self.0[2].$fn(),
+ self.0[3].$fn(),
+ ])
}
};
}
@@ -305,6 +334,20 @@ impl<W: Copy> Vec4<W> for x4<W> {
self
}
}
+impl<W: Copy> Vec4Ext<W> for x4<W> {
+ #[inline(always)]
+ fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self)
+ where
+ Self: Sized,
+ {
+ (
+ x4([a.0[0], b.0[0], c.0[0], d.0[0]]),
+ x4([a.0[1], b.0[1], c.0[1], d.0[1]]),
+ x4([a.0[2], b.0[2], c.0[2], d.0[2]]),
+ x4([a.0[3], b.0[3], c.0[3], d.0[3]]),
+ )
+ }
+}
impl<W: Copy + Store<vec128_storage>> Store<vec512_storage> for x4<W> {
#[inline(always)]
unsafe fn unpack(p: vec512_storage) -> Self {
@@ -363,30 +406,39 @@ impl<W: BSwap + Copy> BSwap for x4<W> {
impl<W: StoreBytes + BSwap + Copy> StoreBytes for x4<W> {
#[inline(always)]
unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+ let n = input.len() / 4;
x4([
- W::unsafe_read_le(&input[0..16]),
- W::unsafe_read_le(&input[16..32]),
- W::unsafe_read_le(&input[32..48]),
- W::unsafe_read_le(&input[48..64]),
+ W::unsafe_read_le(&input[..n]),
+ W::unsafe_read_le(&input[n..n * 2]),
+ W::unsafe_read_le(&input[n * 2..n * 3]),
+ W::unsafe_read_le(&input[n * 3..]),
])
}
#[inline(always)]
unsafe fn unsafe_read_be(input: &[u8]) -> Self {
- x4::unsafe_read_le(input).bswap()
+ let n = input.len() / 4;
+ x4([
+ W::unsafe_read_be(&input[..n]),
+ W::unsafe_read_be(&input[n..n * 2]),
+ W::unsafe_read_be(&input[n * 2..n * 3]),
+ W::unsafe_read_be(&input[n * 3..]),
+ ])
}
#[inline(always)]
fn write_le(self, out: &mut [u8]) {
- self.0[0].write_le(&mut out[0..16]);
- self.0[1].write_le(&mut out[16..32]);
- self.0[2].write_le(&mut out[32..48]);
- self.0[3].write_le(&mut out[48..64]);
+ let n = out.len() / 4;
+ self.0[0].write_le(&mut out[..n]);
+ self.0[1].write_le(&mut out[n..n * 2]);
+ self.0[2].write_le(&mut out[n * 2..n * 3]);
+ self.0[3].write_le(&mut out[n * 3..]);
}
#[inline(always)]
fn write_be(self, out: &mut [u8]) {
- self.0[0].write_be(&mut out[0..16]);
- self.0[1].write_be(&mut out[16..32]);
- self.0[2].write_be(&mut out[32..48]);
- self.0[3].write_be(&mut out[48..64]);
+ let n = out.len() / 4;
+ self.0[0].write_be(&mut out[..n]);
+ self.0[1].write_be(&mut out[n..n * 2]);
+ self.0[2].write_be(&mut out[n * 2..n * 3]);
+ self.0[3].write_be(&mut out[n * 3..]);
}
}
impl<W: Copy + LaneWords4> LaneWords4 for x4<W> {
diff --git a/vendor/ppv-lite86/src/types.rs b/vendor/ppv-lite86/src/types.rs
index 119b6bb8d..f9f3bf1ce 100644
--- a/vendor/ppv-lite86/src/types.rs
+++ b/vendor/ppv-lite86/src/types.rs
@@ -1,3 +1,4 @@
+#![allow(non_camel_case_types)]
use core::ops::{Add, AddAssign, BitAnd, BitOr, BitXor, BitXorAssign, Not};
pub trait AndNot {
@@ -44,182 +45,188 @@ pub trait RotateEachWord64 {
pub trait RotateEachWord128 {}
-#[allow(non_camel_case_types)]
-mod types {
- //! Vector type naming scheme:
- //! uN[xP]xL
- //! Unsigned; N-bit words * P bits per lane * L lanes
- //!
- //! A lane is always 128-bits, chosen because common SIMD architectures treat 128-bit units of
- //! wide vectors specially (supporting e.g. intra-lane shuffles), and tend to have limited and
- //! slow inter-lane operations.
+// Vector type naming scheme:
+// uN[xP]xL
+// Unsigned; N-bit words * P bits per lane * L lanes
+//
+// A lane is always 128-bits, chosen because common SIMD architectures treat 128-bit units of
+// wide vectors specially (supporting e.g. intra-lane shuffles), and tend to have limited and
+// slow inter-lane operations.
- use crate::arch::{vec128_storage, vec256_storage, vec512_storage};
- use crate::{ArithOps, BitOps128, BitOps32, BitOps64, Machine, Store, StoreBytes};
+use crate::arch::{vec128_storage, vec256_storage, vec512_storage};
- pub trait UnsafeFrom<T> {
- unsafe fn unsafe_from(t: T) -> Self;
- }
+#[allow(clippy::missing_safety_doc)]
+pub trait UnsafeFrom<T> {
+ unsafe fn unsafe_from(t: T) -> Self;
+}
- /// A vector composed of two elements, which may be words or themselves vectors.
- pub trait Vec2<W> {
- fn extract(self, i: u32) -> W;
- fn insert(self, w: W, i: u32) -> Self;
- }
+/// A vector composed of two elements, which may be words or themselves vectors.
+pub trait Vec2<W> {
+ fn extract(self, i: u32) -> W;
+ fn insert(self, w: W, i: u32) -> Self;
+}
- /// A vector composed of four elements, which may be words or themselves vectors.
- pub trait Vec4<W> {
- fn extract(self, i: u32) -> W;
- fn insert(self, w: W, i: u32) -> Self;
- }
+/// A vector composed of four elements, which may be words or themselves vectors.
+pub trait Vec4<W> {
+ fn extract(self, i: u32) -> W;
+ fn insert(self, w: W, i: u32) -> Self;
+}
+/// Vec4 functions which may not be implemented yet for all Vec4 types.
+/// NOTE: functions in this trait may be moved to Vec4 in any patch release. To avoid breakage,
+/// import Vec4Ext only together with Vec4, and don't qualify its methods.
+pub trait Vec4Ext<W> {
+ fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self)
+ where
+ Self: Sized;
+}
+pub trait Vector<T> {
+ fn to_scalars(self) -> T;
+}
- // TODO: multiples of 4 should inherit this
- /// A vector composed of four words; depending on their size, operations may cross lanes.
- pub trait Words4 {
- fn shuffle1230(self) -> Self;
- fn shuffle2301(self) -> Self;
- fn shuffle3012(self) -> Self;
- }
+// TODO: multiples of 4 should inherit this
+/// A vector composed of four words; depending on their size, operations may cross lanes.
+pub trait Words4 {
+ fn shuffle1230(self) -> Self;
+ fn shuffle2301(self) -> Self;
+ fn shuffle3012(self) -> Self;
+}
- /// A vector composed one or more lanes each composed of four words.
- pub trait LaneWords4 {
- fn shuffle_lane_words1230(self) -> Self;
- fn shuffle_lane_words2301(self) -> Self;
- fn shuffle_lane_words3012(self) -> Self;
- }
+/// A vector composed one or more lanes each composed of four words.
+pub trait LaneWords4 {
+ fn shuffle_lane_words1230(self) -> Self;
+ fn shuffle_lane_words2301(self) -> Self;
+ fn shuffle_lane_words3012(self) -> Self;
+}
- // TODO: make this a part of BitOps
- /// Exchange neigboring ranges of bits of the specified size
- pub trait Swap64 {
- fn swap1(self) -> Self;
- fn swap2(self) -> Self;
- fn swap4(self) -> Self;
- fn swap8(self) -> Self;
- fn swap16(self) -> Self;
- fn swap32(self) -> Self;
- fn swap64(self) -> Self;
- }
+// TODO: make this a part of BitOps
+/// Exchange neigboring ranges of bits of the specified size
+pub trait Swap64 {
+ fn swap1(self) -> Self;
+ fn swap2(self) -> Self;
+ fn swap4(self) -> Self;
+ fn swap8(self) -> Self;
+ fn swap16(self) -> Self;
+ fn swap32(self) -> Self;
+ fn swap64(self) -> Self;
+}
- pub trait u32x4<M: Machine>:
- BitOps32
- + Store<vec128_storage>
- + ArithOps
- + Vec4<u32>
- + Words4
- + LaneWords4
- + StoreBytes
- + MultiLane<[u32; 4]>
- + Into<vec128_storage>
- {
+pub trait u32x4<M: Machine>:
+ BitOps32
+ + Store<vec128_storage>
+ + ArithOps
+ + Vec4<u32>
+ + Words4
+ + LaneWords4
+ + StoreBytes
+ + MultiLane<[u32; 4]>
+ + Into<vec128_storage>
+{
}
- pub trait u64x2<M: Machine>:
- BitOps64
- + Store<vec128_storage>
- + ArithOps
- + Vec2<u64>
- + MultiLane<[u64; 2]>
- + Into<vec128_storage>
- {
+pub trait u64x2<M: Machine>:
+ BitOps64 + Store<vec128_storage> + ArithOps + Vec2<u64> + MultiLane<[u64; 2]> + Into<vec128_storage>
+{
}
- pub trait u128x1<M: Machine>:
- BitOps128 + Store<vec128_storage> + Swap64 + MultiLane<[u128; 1]> + Into<vec128_storage>
- {
+pub trait u128x1<M: Machine>:
+ BitOps128 + Store<vec128_storage> + Swap64 + MultiLane<[u128; 1]> + Into<vec128_storage>
+{
}
- pub trait u32x4x2<M: Machine>:
- BitOps32
- + Store<vec256_storage>
- + Vec2<M::u32x4>
- + MultiLane<[M::u32x4; 2]>
- + ArithOps
- + Into<vec256_storage>
- {
+pub trait u32x4x2<M: Machine>:
+ BitOps32
+ + Store<vec256_storage>
+ + Vec2<M::u32x4>
+ + MultiLane<[M::u32x4; 2]>
+ + ArithOps
+ + Into<vec256_storage>
+ + StoreBytes
+{
}
- pub trait u64x2x2<M: Machine>:
- BitOps64
- + Store<vec256_storage>
- + Vec2<M::u64x2>
- + MultiLane<[M::u64x2; 2]>
- + ArithOps
- + StoreBytes
- + Into<vec256_storage>
- {
+pub trait u64x2x2<M: Machine>:
+ BitOps64
+ + Store<vec256_storage>
+ + Vec2<M::u64x2>
+ + MultiLane<[M::u64x2; 2]>
+ + ArithOps
+ + StoreBytes
+ + Into<vec256_storage>
+{
}
- pub trait u64x4<M: Machine>:
- BitOps64
- + Store<vec256_storage>
- + Vec4<u64>
- + MultiLane<[u64; 4]>
- + ArithOps
- + Words4
- + StoreBytes
- + Into<vec256_storage>
- {
+pub trait u64x4<M: Machine>:
+ BitOps64
+ + Store<vec256_storage>
+ + Vec4<u64>
+ + MultiLane<[u64; 4]>
+ + ArithOps
+ + Words4
+ + StoreBytes
+ + Into<vec256_storage>
+{
}
- pub trait u128x2<M: Machine>:
- BitOps128
- + Store<vec256_storage>
- + Vec2<M::u128x1>
- + MultiLane<[M::u128x1; 2]>
- + Swap64
- + Into<vec256_storage>
- {
+pub trait u128x2<M: Machine>:
+ BitOps128
+ + Store<vec256_storage>
+ + Vec2<M::u128x1>
+ + MultiLane<[M::u128x1; 2]>
+ + Swap64
+ + Into<vec256_storage>
+{
}
- pub trait u32x4x4<M: Machine>:
- BitOps32
- + Store<vec512_storage>
- + Vec4<M::u32x4>
- + MultiLane<[M::u32x4; 4]>
- + ArithOps
- + LaneWords4
- + Into<vec512_storage>
- {
+pub trait u32x4x4<M: Machine>:
+ BitOps32
+ + Store<vec512_storage>
+ + Vec4<M::u32x4>
+ + Vec4Ext<M::u32x4>
+ + Vector<[u32; 16]>
+ + MultiLane<[M::u32x4; 4]>
+ + ArithOps
+ + LaneWords4
+ + Into<vec512_storage>
+ + StoreBytes
+{
}
- pub trait u64x2x4<M: Machine>:
- BitOps64
- + Store<vec512_storage>
- + Vec4<M::u64x2>
- + MultiLane<[M::u64x2; 4]>
- + ArithOps
- + Into<vec512_storage>
- {
+pub trait u64x2x4<M: Machine>:
+ BitOps64
+ + Store<vec512_storage>
+ + Vec4<M::u64x2>
+ + MultiLane<[M::u64x2; 4]>
+ + ArithOps
+ + Into<vec512_storage>
+{
}
- // TODO: Words4
- pub trait u128x4<M: Machine>:
- BitOps128
- + Store<vec512_storage>
- + Vec4<M::u128x1>
- + MultiLane<[M::u128x1; 4]>
- + Swap64
- + Into<vec512_storage>
- {
+// TODO: Words4
+pub trait u128x4<M: Machine>:
+ BitOps128
+ + Store<vec512_storage>
+ + Vec4<M::u128x1>
+ + MultiLane<[M::u128x1; 4]>
+ + Swap64
+ + Into<vec512_storage>
+{
}
- /// A vector composed of multiple 128-bit lanes.
- pub trait MultiLane<Lanes> {
- /// Split a multi-lane vector into single-lane vectors.
- fn to_lanes(self) -> Lanes;
- /// Build a multi-lane vector from individual lanes.
- fn from_lanes(lanes: Lanes) -> Self;
- }
+/// A vector composed of multiple 128-bit lanes.
+pub trait MultiLane<Lanes> {
+ /// Split a multi-lane vector into single-lane vectors.
+ fn to_lanes(self) -> Lanes;
+ /// Build a multi-lane vector from individual lanes.
+ fn from_lanes(lanes: Lanes) -> Self;
+}
- /// Combine single vectors into a multi-lane vector.
- pub trait VZip<V> {
- fn vzip(self) -> V;
- }
+/// Combine single vectors into a multi-lane vector.
+pub trait VZip<V> {
+ fn vzip(self) -> V;
+}
- impl<V, T> VZip<V> for T
- where
- V: MultiLane<T>,
- {
- #[inline(always)]
- fn vzip(self) -> V {
- V::from_lanes(self)
- }
+impl<V, T> VZip<V> for T
+where
+ V: MultiLane<T>,
+{
+ #[inline(always)]
+ fn vzip(self) -> V {
+ V::from_lanes(self)
}
}
-pub use self::types::*;
pub trait Machine: Sized + Copy {
type u32x4: u32x4<Self>;
@@ -264,15 +271,27 @@ pub trait Machine: Sized + Copy {
unsafe { V::unsafe_read_be(input) }
}
+ /// # Safety
+ /// Caller must ensure the type of Self is appropriate for the hardware of the execution
+ /// environment.
unsafe fn instance() -> Self;
}
pub trait Store<S> {
+ /// # Safety
+ /// Caller must ensure the type of Self is appropriate for the hardware of the execution
+ /// environment.
unsafe fn unpack(p: S) -> Self;
}
pub trait StoreBytes {
+ /// # Safety
+ /// Caller must ensure the type of Self is appropriate for the hardware of the execution
+ /// environment.
unsafe fn unsafe_read_le(input: &[u8]) -> Self;
+ /// # Safety
+ /// Caller must ensure the type of Self is appropriate for the hardware of the execution
+ /// environment.
unsafe fn unsafe_read_be(input: &[u8]) -> Self;
fn write_le(self, out: &mut [u8]);
fn write_be(self, out: &mut [u8]);
diff --git a/vendor/ppv-lite86/src/x86_64/mod.rs b/vendor/ppv-lite86/src/x86_64/mod.rs
index ecf184f36..937732da3 100644
--- a/vendor/ppv-lite86/src/x86_64/mod.rs
+++ b/vendor/ppv-lite86/src/x86_64/mod.rs
@@ -1,7 +1,7 @@
// crate minimums: sse2, x86_64
-use core::arch::x86_64::{__m128i, __m256i};
use crate::types::*;
+use core::arch::x86_64::{__m128i, __m256i};
mod sse2;
@@ -79,7 +79,7 @@ where
type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
- type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
+ type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>;
type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
@@ -119,16 +119,16 @@ impl Store<vec128_storage> for vec128_storage {
p
}
}
-impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
+impl<'a> From<&'a vec128_storage> for &'a [u32; 4] {
#[inline(always)]
- fn into(self) -> &'a [u32; 4] {
- unsafe { &self.u32x4 }
+ fn from(x: &'a vec128_storage) -> Self {
+ unsafe { &x.u32x4 }
}
}
-impl Into<vec128_storage> for [u32; 4] {
+impl From<[u32; 4]> for vec128_storage {
#[inline(always)]
- fn into(self) -> vec128_storage {
- vec128_storage { u32x4: self }
+ fn from(u32x4: [u32; 4]) -> Self {
+ vec128_storage { u32x4 }
}
}
impl Default for vec128_storage {
@@ -154,10 +154,10 @@ pub union vec256_storage {
sse2: [vec128_storage; 2],
avx: __m256i,
}
-impl Into<vec256_storage> for [u64; 4] {
+impl From<[u64; 4]> for vec256_storage {
#[inline(always)]
- fn into(self) -> vec256_storage {
- vec256_storage { u64x4: self }
+ fn from(u64x4: [u64; 4]) -> Self {
+ vec256_storage { u64x4 }
}
}
impl Default for vec256_storage {
@@ -167,9 +167,11 @@ impl Default for vec256_storage {
}
}
impl vec256_storage {
+ #[inline(always)]
pub fn new128(xs: [vec128_storage; 2]) -> Self {
Self { sse2: xs }
}
+ #[inline(always)]
pub fn split128(self) -> [vec128_storage; 2] {
unsafe { self.sse2 }
}
@@ -200,9 +202,11 @@ impl Default for vec512_storage {
}
}
impl vec512_storage {
+ #[inline(always)]
pub fn new128(xs: [vec128_storage; 4]) -> Self {
Self { sse2: xs }
}
+ #[inline(always)]
pub fn split128(self) -> [vec128_storage; 4] {
unsafe { self.sse2 }
}
@@ -217,10 +221,10 @@ impl PartialEq for vec512_storage {
macro_rules! impl_into {
($storage:ident, $array:ty, $name:ident) => {
- impl Into<$array> for $storage {
+ impl From<$storage> for $array {
#[inline(always)]
- fn into(self) -> $array {
- unsafe { self.$name }
+ fn from(vec: $storage) -> Self {
+ unsafe { vec.$name }
}
}
};
diff --git a/vendor/ppv-lite86/src/x86_64/sse2.rs b/vendor/ppv-lite86/src/x86_64/sse2.rs
index 60e7681c3..97197a436 100644
--- a/vendor/ppv-lite86/src/x86_64/sse2.rs
+++ b/vendor/ppv-lite86/src/x86_64/sse2.rs
@@ -166,49 +166,44 @@ macro_rules! impl_bitops128 {
macro_rules! rotr_32_s3 {
($name:ident, $k0:expr, $k1:expr) => {
- #[inline(always)]
- fn $name(self) -> Self {
- Self::new(unsafe {
- _mm_shuffle_epi8(
- self.x,
- _mm_set_epi64x($k0, $k1),
- )
- })
+ #[inline(always)]
+ fn $name(self) -> Self {
+ Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
}
};
}
macro_rules! rotr_32 {
($name:ident, $i:expr) => {
- #[inline(always)]
- fn $name(self) -> Self {
- Self::new(unsafe {
- _mm_or_si128(
- _mm_srli_epi32(self.x, $i as i32),
- _mm_slli_epi32(self.x, 32 - $i as i32),
- )
- })
- }
+ #[inline(always)]
+ fn $name(self) -> Self {
+ Self::new(unsafe {
+ _mm_or_si128(
+ _mm_srli_epi32(self.x, $i as i32),
+ _mm_slli_epi32(self.x, 32 - $i as i32),
+ )
+ })
+ }
};
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
rotr_32!(rotate_each_word_right7, 7);
rotr_32_s3!(
rotate_each_word_right8,
- 0x0c0f0e0d_080b0a09,
- 0x04070605_00030201
+ 0x0c0f_0e0d_080b_0a09,
+ 0x0407_0605_0003_0201
);
rotr_32!(rotate_each_word_right11, 11);
rotr_32!(rotate_each_word_right12, 12);
rotr_32_s3!(
rotate_each_word_right16,
- 0x0d0c0f0e_09080b0a,
- 0x05040706_01000302
+ 0x0d0c_0f0e_0908_0b0a,
+ 0x0504_0706_0100_0302
);
rotr_32!(rotate_each_word_right20, 20);
rotr_32_s3!(
rotate_each_word_right24,
- 0x0e0d0c0f_0a09080b,
- 0x06050407_02010003
+ 0x0e0d_0c0f_0a09_080b,
+ 0x0605_0407_0201_0003
);
rotr_32!(rotate_each_word_right25, 25);
}
@@ -228,28 +223,23 @@ impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
macro_rules! rotr_64_s3 {
($name:ident, $k0:expr, $k1:expr) => {
- #[inline(always)]
- fn $name(self) -> Self {
- Self::new(unsafe {
- _mm_shuffle_epi8(
- self.x,
- _mm_set_epi64x($k0, $k1),
- )
- })
+ #[inline(always)]
+ fn $name(self) -> Self {
+ Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
}
};
}
macro_rules! rotr_64 {
($name:ident, $i:expr) => {
- #[inline(always)]
- fn $name(self) -> Self {
- Self::new(unsafe {
- _mm_or_si128(
- _mm_srli_epi64(self.x, $i as i32),
- _mm_slli_epi64(self.x, 64 - $i as i32),
- )
- })
- }
+ #[inline(always)]
+ fn $name(self) -> Self {
+ Self::new(unsafe {
+ _mm_or_si128(
+ _mm_srli_epi64(self.x, $i as i32),
+ _mm_slli_epi64(self.x, 64 - $i as i32),
+ )
+ })
+ }
};
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
@@ -296,15 +286,15 @@ impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
macro_rules! rotr_128 {
($name:ident, $i:expr) => {
- #[inline(always)]
- fn $name(self) -> Self {
- Self::new(unsafe {
- _mm_or_si128(
- _mm_srli_si128(self.x, $i as i32),
- _mm_slli_si128(self.x, 128 - $i as i32),
- )
- })
- }
+ #[inline(always)]
+ fn $name(self) -> Self {
+ Self::new(unsafe {
+ _mm_or_si128(
+ _mm_srli_si128(self.x, $i as i32),
+ _mm_slli_si128(self.x, 128 - $i as i32),
+ )
+ })
+ }
};
}
// TODO: completely unoptimized
@@ -411,7 +401,7 @@ impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
}
#[inline(always)]
fn from_lanes(xs: [u128; 1]) -> Self {
- unimplemented!()
+ unimplemented!("{:?}", xs)
}
}
@@ -780,7 +770,7 @@ impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
#[inline(always)]
fn bswap(self) -> Self {
- Self::new(unsafe { unimplemented!() })
+ unimplemented!()
}
}
@@ -890,6 +880,13 @@ pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
+impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> {
+ #[inline(always)]
+ fn to_scalars(self) -> [u32; 16] {
+ unsafe { core::mem::transmute(self) }
+ }
+}
+
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
@@ -993,6 +990,8 @@ where
Machine86<S3, S4, NI>: Machine,
u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
+ u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>,
+ u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
@@ -1014,14 +1013,6 @@ where
{
}
-impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
-where
- u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
- Avx2Machine<NI>: Machine,
- u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
- u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
-{
-}
impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
@@ -1078,6 +1069,7 @@ impl<W: PartialEq, G> PartialEq for x2<W, G> {
}
}
+#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
@@ -1383,65 +1375,78 @@ mod test {
pub mod avx2 {
#![allow(non_camel_case_types)]
- use crate::soft::x4;
+ use crate::soft::{x2, x4};
use crate::types::*;
- use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
+ use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0};
use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
use core::arch::x86_64::*;
use core::marker::PhantomData;
use core::ops::*;
#[derive(Copy, Clone)]
- pub struct u32x4x4_avx2<NI> {
- x: [__m256i; 2],
+ pub struct u32x4x2_avx2<NI> {
+ x: __m256i,
ni: PhantomData<NI>,
}
- impl<NI> u32x4x4_avx2<NI> {
+ impl<NI> u32x4x2_avx2<NI> {
#[inline(always)]
- fn new(x: [__m256i; 2]) -> Self {
+ fn new(x: __m256i) -> Self {
Self { x, ni: PhantomData }
}
}
- impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
- impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> {
+ impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {}
+ impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> {
#[inline(always)]
- unsafe fn unpack(p: vec512_storage) -> Self {
- Self::new([p.avx[0].avx, p.avx[1].avx])
+ unsafe fn unpack(p: vec256_storage) -> Self {
+ Self::new(p.avx)
}
}
- impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
+ impl<NI> StoreBytes for u32x4x2_avx2<NI> {
#[inline(always)]
- fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
+ unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+ assert_eq!(input.len(), 32);
+ Self::new(_mm256_loadu_si256(input.as_ptr() as *const _))
+ }
+ #[inline(always)]
+ unsafe fn unsafe_read_be(input: &[u8]) -> Self {
+ Self::unsafe_read_le(input).bswap()
+ }
+ #[inline(always)]
+ fn write_le(self, out: &mut [u8]) {
unsafe {
- [
- u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
- u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
- u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
- u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
- ]
+ assert_eq!(out.len(), 32);
+ _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x)
}
}
#[inline(always)]
- fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
- Self::new(unsafe {
+ fn write_be(self, out: &mut [u8]) {
+ self.bswap().write_le(out)
+ }
+ }
+ impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> {
+ #[inline(always)]
+ fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] {
+ unsafe {
[
- _mm256_setr_m128i(x[0].x, x[1].x),
- _mm256_setr_m128i(x[2].x, x[3].x),
+ u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
+ u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
]
- })
+ }
+ }
+ #[inline(always)]
+ fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self {
+ Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) })
}
}
- impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
+ impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> {
#[inline(always)]
fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
unsafe {
match i {
- 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
- 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
- 2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
- 3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
+ 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
+ 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
_ => panic!(),
}
}
@@ -1450,61 +1455,21 @@ pub mod avx2 {
fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
Self::new(unsafe {
match i {
- 0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
- 1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
- 2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
- 3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
+ 0 => _mm256_inserti128_si256(self.x, w.x, 0),
+ 1 => _mm256_inserti128_si256(self.x, w.x, 1),
_ => panic!(),
}
})
}
}
- impl<NI> LaneWords4 for u32x4x4_avx2<NI> {
- #[inline(always)]
- fn shuffle_lane_words1230(self) -> Self {
- Self::new(unsafe {
- [
- _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
- _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
- ]
- })
- }
- #[inline(always)]
- fn shuffle_lane_words2301(self) -> Self {
- Self::new(unsafe {
- [
- _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
- _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
- ]
- })
- }
- #[inline(always)]
- fn shuffle_lane_words3012(self) -> Self {
- Self::new(unsafe {
- [
- _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
- _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
- ]
- })
- }
- }
- impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
- impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}
+ impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {}
+ impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {}
macro_rules! shuf_lane_bytes {
($name:ident, $k0:expr, $k1:expr) => {
- #[inline(always)]
- fn $name(self) -> Self {
- Self::new(unsafe {
- [
- _mm256_shuffle_epi8(
- self.x[0],
- _mm256_set_epi64x($k0, $k1, $k0, $k1),
- ),
- _mm256_shuffle_epi8(
- self.x[1],
- _mm256_set_epi64x($k0, $k1, $k0, $k1),
- )
- ]
+ #[inline(always)]
+ fn $name(self) -> Self {
+ Self::new(unsafe {
+ _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1))
})
}
};
@@ -1514,52 +1479,41 @@ pub mod avx2 {
#[inline(always)]
fn $name(self) -> Self {
Self::new(unsafe {
- [
- _mm256_or_si256(
- _mm256_srli_epi32(self.x[0], $i as i32),
- _mm256_slli_epi32(self.x[0], 32 - $i as i32),
- ),
- _mm256_or_si256(
- _mm256_srli_epi32(self.x[1], $i as i32),
- _mm256_slli_epi32(self.x[1], 32 - $i as i32),
- )
- ]
+ _mm256_or_si256(
+ _mm256_srli_epi32(self.x, $i as i32),
+ _mm256_slli_epi32(self.x, 32 - $i as i32),
+ )
})
}
};
}
- impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
+ impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> {
rotr_32!(rotate_each_word_right7, 7);
shuf_lane_bytes!(
rotate_each_word_right8,
- 0x0c0f0e0d_080b0a09,
- 0x04070605_00030201
+ 0x0c0f_0e0d_080b_0a09,
+ 0x0407_0605_0003_0201
);
rotr_32!(rotate_each_word_right11, 11);
rotr_32!(rotate_each_word_right12, 12);
shuf_lane_bytes!(
rotate_each_word_right16,
- 0x0d0c0f0e_09080b0a,
- 0x05040706_01000302
+ 0x0d0c_0f0e_0908_0b0a,
+ 0x0504_0706_0100_0302
);
rotr_32!(rotate_each_word_right20, 20);
shuf_lane_bytes!(
rotate_each_word_right24,
- 0x0e0d0c0f_0a09080b,
- 0x06050407_02010003
+ 0x0e0d_0c0f_0a09_080b,
+ 0x0605_0407_0201_0003
);
rotr_32!(rotate_each_word_right25, 25);
}
- impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}
- impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
+ impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {}
+ impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage {
#[inline(always)]
- fn from(x: u32x4x4_avx2<NI>) -> Self {
- Self {
- avx: [
- vec256_storage { avx: x.x[0] },
- vec256_storage { avx: x.x[1] },
- ],
- }
+ fn from(x: u32x4x2_avx2<NI>) -> Self {
+ Self { avx: x.x }
}
}
@@ -1576,55 +1530,172 @@ pub mod avx2 {
}
};
}
- impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
- impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
- impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
- impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);
+ impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor);
+ impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor);
+ impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand);
+ impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add);
- macro_rules! impl_bitop_x2 {
+ macro_rules! impl_bitop {
($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
impl<NI> $Op for $vec<NI> {
type Output = Self;
#[inline(always)]
fn $op_fn(self, rhs: Self) -> Self::Output {
- Self::new(unsafe {
- [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
- })
+ Self::new(unsafe { $impl_fn(self.x, rhs.x) })
}
}
};
}
- impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
- impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
- impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
- impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
- impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);
+ impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256);
+ impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256);
+ impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256);
+ impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256);
+ impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32);
- impl<NI> Not for u32x4x4_avx2<NI> {
+ impl<NI> Not for u32x4x2_avx2<NI> {
type Output = Self;
#[inline(always)]
fn not(self) -> Self::Output {
unsafe {
let f = _mm256_set1_epi8(-0x7f);
- Self::new([f, f]) ^ self
+ Self::new(f) ^ self
}
}
}
- impl<NI> BSwap for u32x4x4_avx2<NI> {
+ impl<NI> BSwap for u32x4x2_avx2<NI> {
shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
}
- impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
+ impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI>
where
NI: Copy,
{
#[inline(always)]
+ fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self {
+ Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) })
+ }
+ }
+
+ impl<NI> LaneWords4 for u32x4x2_avx2<NI> {
+ #[inline(always)]
+ fn shuffle_lane_words1230(self) -> Self {
+ Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) })
+ }
+ #[inline(always)]
+ fn shuffle_lane_words2301(self) -> Self {
+ Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) })
+ }
+ #[inline(always)]
+ fn shuffle_lane_words3012(self) -> Self {
+ Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) })
+ }
+ }
+
+ ///////////////////////////////////////////////////////////////////////////////////////////
+
+ pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>;
+ impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {}
+
+ impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> {
+ #[inline(always)]
+ unsafe fn unpack(p: vec512_storage) -> Self {
+ Self::new([
+ u32x4x2_avx2::unpack(p.avx[0]),
+ u32x4x2_avx2::unpack(p.avx[1]),
+ ])
+ }
+ }
+ impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
+ #[inline(always)]
+ fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
+ let [a, b] = self.0[0].to_lanes();
+ let [c, d] = self.0[1].to_lanes();
+ [a, b, c, d]
+ }
+ #[inline(always)]
+ fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
+ let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]);
+ let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]);
+ Self::new([ab, cd])
+ }
+ }
+ impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
+ #[inline(always)]
+ fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
+ match i {
+ 0 => self.0[0].extract(0),
+ 1 => self.0[0].extract(1),
+ 2 => self.0[1].extract(0),
+ 3 => self.0[1].extract(1),
+ _ => panic!(),
+ }
+ }
+ #[inline(always)]
+ fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
+ Self::new(match i {
+ 0 | 1 => [self.0[0].insert(w, i), self.0[1]],
+ 2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)],
+ _ => panic!(),
+ })
+ }
+ }
+ impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
+ #[inline(always)]
+ fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) {
+ /*
+ * a00:a01 a10:a11
+ * b00:b01 b10:b11
+ * c00:c01 c10:c11
+ * d00:d01 d10:d11
+ * =>
+ * a00:b00 c00:d00
+ * a01:b01 c01:d01
+ * a10:b10 c10:d10
+ * a11:b11 c11:d11
+ */
+ unsafe {
+ let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20));
+ let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31));
+ let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20));
+ let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31));
+ let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20));
+ let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31));
+ let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20));
+ let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31));
+ (
+ Self::new([ab00, cd00]),
+ Self::new([ab01, cd01]),
+ Self::new([ab10, cd10]),
+ Self::new([ab11, cd11]),
+ )
+ }
+ }
+ }
+ impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> {
+ #[inline(always)]
+ fn to_scalars(self) -> [u32; 16] {
+ unsafe { core::mem::transmute(self) }
+ }
+ }
+ impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage {
+ #[inline(always)]
+ fn from(x: u32x4x4_avx2<NI>) -> Self {
+ Self {
+ avx: [
+ vec256_storage { avx: x.0[0].x },
+ vec256_storage { avx: x.0[1].x },
+ ],
+ }
+ }
+ }
+ impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> {
+ #[inline(always)]
fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
Self::new(unsafe {
[
- _mm256_setr_m128i(x.0[0].x, x.0[1].x),
- _mm256_setr_m128i(x.0[2].x, x.0[3].x),
+ u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)),
+ u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)),
]
})
}