summaryrefslogtreecommitdiffstats
path: root/third_party/rust/packed_simd_2/src/codegen
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:22:09 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:22:09 +0000
commit43a97878ce14b72f0981164f87f2e35e14151312 (patch)
tree620249daf56c0258faa40cbdcf9cfba06de2a846 /third_party/rust/packed_simd_2/src/codegen
parentInitial commit. (diff)
downloadfirefox-43a97878ce14b72f0981164f87f2e35e14151312.tar.xz
firefox-43a97878ce14b72f0981164f87f2e35e14151312.zip
Adding upstream version 110.0.1.upstream/110.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/packed_simd_2/src/codegen')
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/bit_manip.rs347
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/llvm.rs128
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math.rs3
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float.rs18
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/abs.rs103
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/cos.rs103
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/cos_pi.rs87
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/exp.rs112
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/ln.rs112
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/macros.rs470
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/mul_add.rs109
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/mul_adde.rs60
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/powf.rs112
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/sin.rs103
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/sin_cos_pi.rs188
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/sin_pi.rs87
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/sqrt.rs103
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/math/float/sqrte.rs67
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/pointer_sized_int.rs28
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/reductions.rs1
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/reductions/mask.rs69
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/reductions/mask/aarch64.rs81
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/reductions/mask/arm.rs56
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/reductions/mask/fallback.rs8
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs237
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86.rs216
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs95
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs35
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs35
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs68
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/shuffle.rs150
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/shuffle1_dyn.rs408
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/swap_bytes.rs149
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/v128.rs46
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/v16.rs7
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/v256.rs78
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/v32.rs11
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/v512.rs145
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/v64.rs21
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/vPtr.rs35
-rw-r--r--third_party/rust/packed_simd_2/src/codegen/vSize.rs16
41 files changed, 4307 insertions, 0 deletions
diff --git a/third_party/rust/packed_simd_2/src/codegen/bit_manip.rs b/third_party/rust/packed_simd_2/src/codegen/bit_manip.rs
new file mode 100644
index 0000000000..32d8d717a0
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/bit_manip.rs
@@ -0,0 +1,347 @@
+//! LLVM bit manipulation intrinsics.
+#[rustfmt::skip]
+
+pub(crate) use crate::*;
+
+#[allow(improper_ctypes, dead_code)]
+extern "C" {
+ #[link_name = "llvm.ctlz.v2i8"]
+ fn ctlz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2;
+ #[link_name = "llvm.ctlz.v4i8"]
+ fn ctlz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4;
+ #[link_name = "llvm.ctlz.v8i8"]
+ fn ctlz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8;
+ #[link_name = "llvm.ctlz.v16i8"]
+ fn ctlz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16;
+ #[link_name = "llvm.ctlz.v32i8"]
+ fn ctlz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32;
+ #[link_name = "llvm.ctlz.v64i8"]
+ fn ctlz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64;
+
+ #[link_name = "llvm.ctlz.v2i16"]
+ fn ctlz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2;
+ #[link_name = "llvm.ctlz.v4i16"]
+ fn ctlz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4;
+ #[link_name = "llvm.ctlz.v8i16"]
+ fn ctlz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8;
+ #[link_name = "llvm.ctlz.v16i16"]
+ fn ctlz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16;
+ #[link_name = "llvm.ctlz.v32i16"]
+ fn ctlz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32;
+
+ #[link_name = "llvm.ctlz.v2i32"]
+ fn ctlz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2;
+ #[link_name = "llvm.ctlz.v4i32"]
+ fn ctlz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4;
+ #[link_name = "llvm.ctlz.v8i32"]
+ fn ctlz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8;
+ #[link_name = "llvm.ctlz.v16i32"]
+ fn ctlz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16;
+
+ #[link_name = "llvm.ctlz.v2i64"]
+ fn ctlz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2;
+ #[link_name = "llvm.ctlz.v4i64"]
+ fn ctlz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4;
+ #[link_name = "llvm.ctlz.v8i64"]
+ fn ctlz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8;
+
+ #[link_name = "llvm.ctlz.v1i128"]
+ fn ctlz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1;
+ #[link_name = "llvm.ctlz.v2i128"]
+ fn ctlz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2;
+ #[link_name = "llvm.ctlz.v4i128"]
+ fn ctlz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4;
+
+ #[link_name = "llvm.cttz.v2i8"]
+ fn cttz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2;
+ #[link_name = "llvm.cttz.v4i8"]
+ fn cttz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4;
+ #[link_name = "llvm.cttz.v8i8"]
+ fn cttz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8;
+ #[link_name = "llvm.cttz.v16i8"]
+ fn cttz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16;
+ #[link_name = "llvm.cttz.v32i8"]
+ fn cttz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32;
+ #[link_name = "llvm.cttz.v64i8"]
+ fn cttz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64;
+
+ #[link_name = "llvm.cttz.v2i16"]
+ fn cttz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2;
+ #[link_name = "llvm.cttz.v4i16"]
+ fn cttz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4;
+ #[link_name = "llvm.cttz.v8i16"]
+ fn cttz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8;
+ #[link_name = "llvm.cttz.v16i16"]
+ fn cttz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16;
+ #[link_name = "llvm.cttz.v32i16"]
+ fn cttz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32;
+
+ #[link_name = "llvm.cttz.v2i32"]
+ fn cttz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2;
+ #[link_name = "llvm.cttz.v4i32"]
+ fn cttz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4;
+ #[link_name = "llvm.cttz.v8i32"]
+ fn cttz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8;
+ #[link_name = "llvm.cttz.v16i32"]
+ fn cttz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16;
+
+ #[link_name = "llvm.cttz.v2i64"]
+ fn cttz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2;
+ #[link_name = "llvm.cttz.v4i64"]
+ fn cttz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4;
+ #[link_name = "llvm.cttz.v8i64"]
+ fn cttz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8;
+
+ #[link_name = "llvm.cttz.v1i128"]
+ fn cttz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1;
+ #[link_name = "llvm.cttz.v2i128"]
+ fn cttz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2;
+ #[link_name = "llvm.cttz.v4i128"]
+ fn cttz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4;
+
+ #[link_name = "llvm.ctpop.v2i8"]
+ fn ctpop_u8x2(x: u8x2) -> u8x2;
+ #[link_name = "llvm.ctpop.v4i8"]
+ fn ctpop_u8x4(x: u8x4) -> u8x4;
+ #[link_name = "llvm.ctpop.v8i8"]
+ fn ctpop_u8x8(x: u8x8) -> u8x8;
+ #[link_name = "llvm.ctpop.v16i8"]
+ fn ctpop_u8x16(x: u8x16) -> u8x16;
+ #[link_name = "llvm.ctpop.v32i8"]
+ fn ctpop_u8x32(x: u8x32) -> u8x32;
+ #[link_name = "llvm.ctpop.v64i8"]
+ fn ctpop_u8x64(x: u8x64) -> u8x64;
+
+ #[link_name = "llvm.ctpop.v2i16"]
+ fn ctpop_u16x2(x: u16x2) -> u16x2;
+ #[link_name = "llvm.ctpop.v4i16"]
+ fn ctpop_u16x4(x: u16x4) -> u16x4;
+ #[link_name = "llvm.ctpop.v8i16"]
+ fn ctpop_u16x8(x: u16x8) -> u16x8;
+ #[link_name = "llvm.ctpop.v16i16"]
+ fn ctpop_u16x16(x: u16x16) -> u16x16;
+ #[link_name = "llvm.ctpop.v32i16"]
+ fn ctpop_u16x32(x: u16x32) -> u16x32;
+
+ #[link_name = "llvm.ctpop.v2i32"]
+ fn ctpop_u32x2(x: u32x2) -> u32x2;
+ #[link_name = "llvm.ctpop.v4i32"]
+ fn ctpop_u32x4(x: u32x4) -> u32x4;
+ #[link_name = "llvm.ctpop.v8i32"]
+ fn ctpop_u32x8(x: u32x8) -> u32x8;
+ #[link_name = "llvm.ctpop.v16i32"]
+ fn ctpop_u32x16(x: u32x16) -> u32x16;
+
+ #[link_name = "llvm.ctpop.v2i64"]
+ fn ctpop_u64x2(x: u64x2) -> u64x2;
+ #[link_name = "llvm.ctpop.v4i64"]
+ fn ctpop_u64x4(x: u64x4) -> u64x4;
+ #[link_name = "llvm.ctpop.v8i64"]
+ fn ctpop_u64x8(x: u64x8) -> u64x8;
+
+ #[link_name = "llvm.ctpop.v1i128"]
+ fn ctpop_u128x1(x: u128x1) -> u128x1;
+ #[link_name = "llvm.ctpop.v2i128"]
+ fn ctpop_u128x2(x: u128x2) -> u128x2;
+ #[link_name = "llvm.ctpop.v4i128"]
+ fn ctpop_u128x4(x: u128x4) -> u128x4;
+}
+
+pub(crate) trait BitManip {
+ fn ctpop(self) -> Self;
+ fn ctlz(self) -> Self;
+ fn cttz(self) -> Self;
+}
+
+macro_rules! impl_bit_manip {
+ (inner: $ty:ident, $scalar:ty, $uty:ident,
+ $ctpop:ident, $ctlz:ident, $cttz:ident) => {
+ // FIXME: several LLVM intrinsics break on s390x https://github.com/rust-lang-nursery/packed_simd/issues/192
+ #[cfg(target_arch = "s390x")]
+ impl_bit_manip! { scalar: $ty, $scalar }
+ #[cfg(not(target_arch = "s390x"))]
+ impl BitManip for $ty {
+ #[inline]
+ fn ctpop(self) -> Self {
+ let y: $uty = self.cast();
+ unsafe { $ctpop(y).cast() }
+ }
+
+ #[inline]
+ fn ctlz(self) -> Self {
+ let y: $uty = self.cast();
+ // the ctxx intrinsics need compile-time constant
+ // `is_zero_undef`
+ unsafe { $ctlz(y, false).cast() }
+ }
+
+ #[inline]
+ fn cttz(self) -> Self {
+ let y: $uty = self.cast();
+ unsafe { $cttz(y, false).cast() }
+ }
+ }
+ };
+ (sized_inner: $ty:ident, $scalar:ty, $uty:ident) => {
+ #[cfg(target_arch = "s390x")]
+ impl_bit_manip! { scalar: $ty, $scalar }
+ #[cfg(not(target_arch = "s390x"))]
+ impl BitManip for $ty {
+ #[inline]
+ fn ctpop(self) -> Self {
+ let y: $uty = self.cast();
+ $uty::ctpop(y).cast()
+ }
+
+ #[inline]
+ fn ctlz(self) -> Self {
+ let y: $uty = self.cast();
+ $uty::ctlz(y).cast()
+ }
+
+ #[inline]
+ fn cttz(self) -> Self {
+ let y: $uty = self.cast();
+ $uty::cttz(y).cast()
+ }
+ }
+ };
+ (scalar: $ty:ident, $scalar:ty) => {
+ impl BitManip for $ty {
+ #[inline]
+ fn ctpop(self) -> Self {
+ let mut ones = self;
+ for i in 0..Self::lanes() {
+ ones = ones.replace(i, self.extract(i).count_ones() as $scalar);
+ }
+ ones
+ }
+
+ #[inline]
+ fn ctlz(self) -> Self {
+ let mut lz = self;
+ for i in 0..Self::lanes() {
+ lz = lz.replace(i, self.extract(i).leading_zeros() as $scalar);
+ }
+ lz
+ }
+
+ #[inline]
+ fn cttz(self) -> Self {
+ let mut tz = self;
+ for i in 0..Self::lanes() {
+ tz = tz.replace(i, self.extract(i).trailing_zeros() as $scalar);
+ }
+ tz
+ }
+ }
+ };
+ ($uty:ident, $uscalar:ty, $ity:ident, $iscalar:ty,
+ $ctpop:ident, $ctlz:ident, $cttz:ident) => {
+ impl_bit_manip! { inner: $uty, $uscalar, $uty, $ctpop, $ctlz, $cttz }
+ impl_bit_manip! { inner: $ity, $iscalar, $uty, $ctpop, $ctlz, $cttz }
+ };
+ (sized: $usize:ident, $uscalar:ty, $isize:ident,
+ $iscalar:ty, $ty:ident) => {
+ impl_bit_manip! { sized_inner: $usize, $uscalar, $ty }
+ impl_bit_manip! { sized_inner: $isize, $iscalar, $ty }
+ };
+}
+
+impl_bit_manip! { u8x2 , u8, i8x2, i8, ctpop_u8x2, ctlz_u8x2, cttz_u8x2 }
+impl_bit_manip! { u8x4 , u8, i8x4, i8, ctpop_u8x4, ctlz_u8x4, cttz_u8x4 }
+#[cfg(not(target_arch = "aarch64"))] // see below
+impl_bit_manip! { u8x8 , u8, i8x8, i8, ctpop_u8x8, ctlz_u8x8, cttz_u8x8 }
+impl_bit_manip! { u8x16 , u8, i8x16, i8, ctpop_u8x16, ctlz_u8x16, cttz_u8x16 }
+impl_bit_manip! { u8x32 , u8, i8x32, i8, ctpop_u8x32, ctlz_u8x32, cttz_u8x32 }
+impl_bit_manip! { u8x64 , u8, i8x64, i8, ctpop_u8x64, ctlz_u8x64, cttz_u8x64 }
+impl_bit_manip! { u16x2 , u16, i16x2, i16, ctpop_u16x2, ctlz_u16x2, cttz_u16x2 }
+impl_bit_manip! { u16x4 , u16, i16x4, i16, ctpop_u16x4, ctlz_u16x4, cttz_u16x4 }
+impl_bit_manip! { u16x8 , u16, i16x8, i16, ctpop_u16x8, ctlz_u16x8, cttz_u16x8 }
+impl_bit_manip! { u16x16 , u16, i16x16, i16, ctpop_u16x16, ctlz_u16x16, cttz_u16x16 }
+impl_bit_manip! { u16x32 , u16, i16x32, i16, ctpop_u16x32, ctlz_u16x32, cttz_u16x32 }
+impl_bit_manip! { u32x2 , u32, i32x2, i32, ctpop_u32x2, ctlz_u32x2, cttz_u32x2 }
+impl_bit_manip! { u32x4 , u32, i32x4, i32, ctpop_u32x4, ctlz_u32x4, cttz_u32x4 }
+impl_bit_manip! { u32x8 , u32, i32x8, i32, ctpop_u32x8, ctlz_u32x8, cttz_u32x8 }
+impl_bit_manip! { u32x16 , u32, i32x16, i32, ctpop_u32x16, ctlz_u32x16, cttz_u32x16 }
+impl_bit_manip! { u64x2 , u64, i64x2, i64, ctpop_u64x2, ctlz_u64x2, cttz_u64x2 }
+impl_bit_manip! { u64x4 , u64, i64x4, i64, ctpop_u64x4, ctlz_u64x4, cttz_u64x4 }
+impl_bit_manip! { u64x8 , u64, i64x8, i64, ctpop_u64x8, ctlz_u64x8, cttz_u64x8 }
+impl_bit_manip! { u128x1 , u128, i128x1, i128, ctpop_u128x1, ctlz_u128x1, cttz_u128x1 }
+impl_bit_manip! { u128x2 , u128, i128x2, i128, ctpop_u128x2, ctlz_u128x2, cttz_u128x2 }
+impl_bit_manip! { u128x4 , u128, i128x4, i128, ctpop_u128x4, ctlz_u128x4, cttz_u128x4 }
+
+#[cfg(target_arch = "aarch64")]
+impl BitManip for u8x8 {
+ #[inline]
+ fn ctpop(self) -> Self {
+ let y: u8x8 = self.cast();
+ unsafe { ctpop_u8x8(y).cast() }
+ }
+
+ #[inline]
+ fn ctlz(self) -> Self {
+ let y: u8x8 = self.cast();
+ unsafe { ctlz_u8x8(y, false).cast() }
+ }
+
+ #[inline]
+ fn cttz(self) -> Self {
+ // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191
+ // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64
+ // intrinsics
+ let mut tz = self;
+ for i in 0..Self::lanes() {
+ tz = tz.replace(i, self.extract(i).trailing_zeros() as u8);
+ }
+ tz
+ }
+}
+#[cfg(target_arch = "aarch64")]
+impl BitManip for i8x8 {
+ #[inline]
+ fn ctpop(self) -> Self {
+ let y: u8x8 = self.cast();
+ unsafe { ctpop_u8x8(y).cast() }
+ }
+
+ #[inline]
+ fn ctlz(self) -> Self {
+ let y: u8x8 = self.cast();
+ unsafe { ctlz_u8x8(y, false).cast() }
+ }
+
+ #[inline]
+ fn cttz(self) -> Self {
+ // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191
+ // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64
+ // intrinsics
+ let mut tz = self;
+ for i in 0..Self::lanes() {
+ tz = tz.replace(i, self.extract(i).trailing_zeros() as i8);
+ }
+ tz
+ }
+}
+
+cfg_if! {
+ if #[cfg(target_pointer_width = "8")] {
+ impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u8x2 }
+ impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u8x4 }
+ impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u8x8 }
+ } else if #[cfg(target_pointer_width = "16")] {
+ impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u16x2 }
+ impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u16x4 }
+ impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u16x8 }
+ } else if #[cfg(target_pointer_width = "32")] {
+ impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u32x2 }
+ impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u32x4 }
+ impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u32x8 }
+ } else if #[cfg(target_pointer_width = "64")] {
+ impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u64x2 }
+ impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u64x4 }
+ impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u64x8 }
+ } else {
+ compile_error!("unsupported target_pointer_width");
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/llvm.rs b/third_party/rust/packed_simd_2/src/codegen/llvm.rs
new file mode 100644
index 0000000000..b4c09849bc
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/llvm.rs
@@ -0,0 +1,128 @@
+//! LLVM's platform intrinsics
+#![allow(dead_code)]
+
+use crate::sealed::Shuffle;
+#[allow(unused_imports)] // FIXME: spurious warning?
+use crate::sealed::Simd;
+
+// Shuffle intrinsics: expanded in users' crates, therefore public.
+extern "platform-intrinsic" {
+ pub fn simd_shuffle2<T, U>(x: T, y: T, idx: [u32; 2]) -> U;
+ pub fn simd_shuffle4<T, U>(x: T, y: T, idx: [u32; 4]) -> U;
+ pub fn simd_shuffle8<T, U>(x: T, y: T, idx: [u32; 8]) -> U;
+ pub fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U;
+ pub fn simd_shuffle32<T, U>(x: T, y: T, idx: [u32; 32]) -> U;
+ pub fn simd_shuffle64<T, U>(x: T, y: T, idx: [u32; 64]) -> U;
+}
+
+#[allow(clippy::missing_safety_doc)]
+#[inline]
+pub unsafe fn __shuffle_vector2<const IDX: [u32; 2], T, U>(x: T, y: T) -> U
+where
+ T: Simd,
+ <T as Simd>::Element: Shuffle<[u32; 2], Output = U>,
+{
+ simd_shuffle2(x, y, IDX)
+}
+
+#[allow(clippy::missing_safety_doc)]
+#[inline]
+pub unsafe fn __shuffle_vector4<const IDX: [u32; 4], T, U>(x: T, y: T) -> U
+where
+ T: Simd,
+ <T as Simd>::Element: Shuffle<[u32; 4], Output = U>,
+{
+ simd_shuffle4(x, y, IDX)
+}
+
+#[allow(clippy::missing_safety_doc)]
+#[inline]
+pub unsafe fn __shuffle_vector8<const IDX: [u32; 8], T, U>(x: T, y: T) -> U
+where
+ T: Simd,
+ <T as Simd>::Element: Shuffle<[u32; 8], Output = U>,
+{
+ simd_shuffle8(x, y, IDX)
+}
+
+#[allow(clippy::missing_safety_doc)]
+#[inline]
+pub unsafe fn __shuffle_vector16<const IDX: [u32; 16], T, U>(x: T, y: T) -> U
+where
+ T: Simd,
+ <T as Simd>::Element: Shuffle<[u32; 16], Output = U>,
+{
+ simd_shuffle16(x, y, IDX)
+}
+
+#[allow(clippy::missing_safety_doc)]
+#[inline]
+pub unsafe fn __shuffle_vector32<const IDX: [u32; 32], T, U>(x: T, y: T) -> U
+where
+ T: Simd,
+ <T as Simd>::Element: Shuffle<[u32; 32], Output = U>,
+{
+ simd_shuffle32(x, y, IDX)
+}
+
+#[allow(clippy::missing_safety_doc)]
+#[inline]
+pub unsafe fn __shuffle_vector64<const IDX: [u32; 64], T, U>(x: T, y: T) -> U
+where
+ T: Simd,
+ <T as Simd>::Element: Shuffle<[u32; 64], Output = U>,
+{
+ simd_shuffle64(x, y, IDX)
+}
+
+extern "platform-intrinsic" {
+ pub(crate) fn simd_eq<T, U>(x: T, y: T) -> U;
+ pub(crate) fn simd_ne<T, U>(x: T, y: T) -> U;
+ pub(crate) fn simd_lt<T, U>(x: T, y: T) -> U;
+ pub(crate) fn simd_le<T, U>(x: T, y: T) -> U;
+ pub(crate) fn simd_gt<T, U>(x: T, y: T) -> U;
+ pub(crate) fn simd_ge<T, U>(x: T, y: T) -> U;
+
+ pub(crate) fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;
+ pub(crate) fn simd_extract<T, U>(x: T, idx: u32) -> U;
+
+ pub(crate) fn simd_cast<T, U>(x: T) -> U;
+
+ pub(crate) fn simd_add<T>(x: T, y: T) -> T;
+ pub(crate) fn simd_sub<T>(x: T, y: T) -> T;
+ pub(crate) fn simd_mul<T>(x: T, y: T) -> T;
+ pub(crate) fn simd_div<T>(x: T, y: T) -> T;
+ pub(crate) fn simd_rem<T>(x: T, y: T) -> T;
+ pub(crate) fn simd_shl<T>(x: T, y: T) -> T;
+ pub(crate) fn simd_shr<T>(x: T, y: T) -> T;
+ pub(crate) fn simd_and<T>(x: T, y: T) -> T;
+ pub(crate) fn simd_or<T>(x: T, y: T) -> T;
+ pub(crate) fn simd_xor<T>(x: T, y: T) -> T;
+
+ pub(crate) fn simd_reduce_add_unordered<T, U>(x: T) -> U;
+ pub(crate) fn simd_reduce_mul_unordered<T, U>(x: T) -> U;
+ pub(crate) fn simd_reduce_add_ordered<T, U>(x: T, acc: U) -> U;
+ pub(crate) fn simd_reduce_mul_ordered<T, U>(x: T, acc: U) -> U;
+ pub(crate) fn simd_reduce_min<T, U>(x: T) -> U;
+ pub(crate) fn simd_reduce_max<T, U>(x: T) -> U;
+ pub(crate) fn simd_reduce_min_nanless<T, U>(x: T) -> U;
+ pub(crate) fn simd_reduce_max_nanless<T, U>(x: T) -> U;
+ pub(crate) fn simd_reduce_and<T, U>(x: T) -> U;
+ pub(crate) fn simd_reduce_or<T, U>(x: T) -> U;
+ pub(crate) fn simd_reduce_xor<T, U>(x: T) -> U;
+ pub(crate) fn simd_reduce_all<T>(x: T) -> bool;
+ pub(crate) fn simd_reduce_any<T>(x: T) -> bool;
+
+ pub(crate) fn simd_select<M, T>(m: M, a: T, b: T) -> T;
+
+ pub(crate) fn simd_fmin<T>(a: T, b: T) -> T;
+ pub(crate) fn simd_fmax<T>(a: T, b: T) -> T;
+
+ pub(crate) fn simd_fsqrt<T>(a: T) -> T;
+ pub(crate) fn simd_fma<T>(a: T, b: T, c: T) -> T;
+
+ pub(crate) fn simd_gather<T, P, M>(value: T, pointers: P, mask: M) -> T;
+ pub(crate) fn simd_scatter<T, P, M>(value: T, pointers: P, mask: M);
+
+ pub(crate) fn simd_bitmask<T, U>(value: T) -> U;
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/math.rs b/third_party/rust/packed_simd_2/src/codegen/math.rs
new file mode 100644
index 0000000000..9a0ea7a4e2
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math.rs
@@ -0,0 +1,3 @@
+//! Vertical math operations
+
+pub(crate) mod float;
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float.rs b/third_party/rust/packed_simd_2/src/codegen/math/float.rs
new file mode 100644
index 0000000000..ffbf18bfe9
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float.rs
@@ -0,0 +1,18 @@
+//! Vertical floating-point math operations.
+#![allow(clippy::useless_transmute)]
+
+#[macro_use]
+pub(crate) mod macros;
+pub(crate) mod abs;
+pub(crate) mod cos;
+pub(crate) mod cos_pi;
+pub(crate) mod exp;
+pub(crate) mod ln;
+pub(crate) mod mul_add;
+pub(crate) mod mul_adde;
+pub(crate) mod powf;
+pub(crate) mod sin;
+pub(crate) mod sin_cos_pi;
+pub(crate) mod sin_pi;
+pub(crate) mod sqrt;
+pub(crate) mod sqrte;
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/abs.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/abs.rs
new file mode 100644
index 0000000000..34aacc25be
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/abs.rs
@@ -0,0 +1,103 @@
+//! Vertical floating-point `fabs`
+#![allow(unused)]
+
+// FIXME 64-bit 1 elem vectors fabs
+
+use crate::*;
+
+pub(crate) trait Abs {
+ fn abs(self) -> Self;
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.fabs.v2f32"]
+ fn fabs_v2f32(x: f32x2) -> f32x2;
+ #[link_name = "llvm.fabs.v4f32"]
+ fn fabs_v4f32(x: f32x4) -> f32x4;
+ #[link_name = "llvm.fabs.v8f32"]
+ fn fabs_v8f32(x: f32x8) -> f32x8;
+ #[link_name = "llvm.fabs.v16f32"]
+ fn fabs_v16f32(x: f32x16) -> f32x16;
+ /* FIXME 64-bit fabsgle elem vectors
+ #[link_name = "llvm.fabs.v1f64"]
+ fn fabs_v1f64(x: f64x1) -> f64x1;
+ */
+ #[link_name = "llvm.fabs.v2f64"]
+ fn fabs_v2f64(x: f64x2) -> f64x2;
+ #[link_name = "llvm.fabs.v4f64"]
+ fn fabs_v4f64(x: f64x4) -> f64x4;
+ #[link_name = "llvm.fabs.v8f64"]
+ fn fabs_v8f64(x: f64x8) -> f64x8;
+
+ #[link_name = "llvm.fabs.f32"]
+ fn fabs_f32(x: f32) -> f32;
+ #[link_name = "llvm.fabs.f64"]
+ fn fabs_f64(x: f64) -> f64;
+}
+
+gen_unary_impl_table!(Abs, abs);
+
+cfg_if! {
+ if #[cfg(target_arch = "s390x")] {
+ // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14
+ impl_unary!(f32x2[f32; 2]: fabs_f32);
+ impl_unary!(f32x4[f32; 4]: fabs_f32);
+ impl_unary!(f32x8[f32; 8]: fabs_f32);
+ impl_unary!(f32x16[f32; 16]: fabs_f32);
+
+ impl_unary!(f64x2[f64; 2]: fabs_f64);
+ impl_unary!(f64x4[f64; 4]: fabs_f64);
+ impl_unary!(f64x8[f64; 8]: fabs_f64);
+ } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] {
+ use sleef_sys::*;
+ cfg_if! {
+ if #[cfg(target_feature = "avx2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_avx2128);
+ impl_unary!(f32x16[h => f32x8]: Sleef_fabsf8_avx2);
+ impl_unary!(f64x8[h => f64x4]: Sleef_fabsd4_avx2);
+
+ impl_unary!(f32x4: Sleef_fabsf4_avx2128);
+ impl_unary!(f32x8: Sleef_fabsf8_avx2);
+ impl_unary!(f64x2: Sleef_fabsd2_avx2128);
+ impl_unary!(f64x4: Sleef_fabsd4_avx2);
+ } else if #[cfg(target_feature = "avx")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_sse4);
+ impl_unary!(f32x16[h => f32x8]: Sleef_fabsf8_avx);
+ impl_unary!(f64x8[h => f64x4]: Sleef_fabsd4_avx);
+
+ impl_unary!(f32x4: Sleef_fabsf4_sse4);
+ impl_unary!(f32x8: Sleef_fabsf8_avx);
+ impl_unary!(f64x2: Sleef_fabsd2_sse4);
+ impl_unary!(f64x4: Sleef_fabsd4_avx);
+ } else if #[cfg(target_feature = "sse4.2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_sse4);
+ impl_unary!(f32x16[q => f32x4]: Sleef_fabsf4_sse4);
+ impl_unary!(f64x8[q => f64x2]: Sleef_fabsd2_sse4);
+
+ impl_unary!(f32x4: Sleef_fabsf4_sse4);
+ impl_unary!(f32x8[h => f32x4]: Sleef_fabsf4_sse4);
+ impl_unary!(f64x2: Sleef_fabsd2_sse4);
+ impl_unary!(f64x4[h => f64x2]: Sleef_fabsd2_sse4);
+ } else {
+ impl_unary!(f32x2[f32; 2]: fabs_f32);
+ impl_unary!(f32x16: fabs_v16f32);
+ impl_unary!(f64x8: fabs_v8f64);
+
+ impl_unary!(f32x4: fabs_v4f32);
+ impl_unary!(f32x8: fabs_v8f32);
+ impl_unary!(f64x2: fabs_v2f64);
+ impl_unary!(f64x4: fabs_v4f64);
+ }
+ }
+ } else {
+ impl_unary!(f32x2[f32; 2]: fabs_f32);
+ impl_unary!(f32x4: fabs_v4f32);
+ impl_unary!(f32x8: fabs_v8f32);
+ impl_unary!(f32x16: fabs_v16f32);
+
+ impl_unary!(f64x2: fabs_v2f64);
+ impl_unary!(f64x4: fabs_v4f64);
+ impl_unary!(f64x8: fabs_v8f64);
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/cos.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/cos.rs
new file mode 100644
index 0000000000..dec390cb74
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/cos.rs
@@ -0,0 +1,103 @@
+//! Vertical floating-point `cos`
+#![allow(unused)]
+
+// FIXME 64-bit 1 elem vector cos
+
+use crate::*;
+
+pub(crate) trait Cos {
+ fn cos(self) -> Self;
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.cos.v2f32"]
+ fn cos_v2f32(x: f32x2) -> f32x2;
+ #[link_name = "llvm.cos.v4f32"]
+ fn cos_v4f32(x: f32x4) -> f32x4;
+ #[link_name = "llvm.cos.v8f32"]
+ fn cos_v8f32(x: f32x8) -> f32x8;
+ #[link_name = "llvm.cos.v16f32"]
+ fn cos_v16f32(x: f32x16) -> f32x16;
+ /* FIXME 64-bit cosgle elem vectors
+ #[link_name = "llvm.cos.v1f64"]
+ fn cos_v1f64(x: f64x1) -> f64x1;
+ */
+ #[link_name = "llvm.cos.v2f64"]
+ fn cos_v2f64(x: f64x2) -> f64x2;
+ #[link_name = "llvm.cos.v4f64"]
+ fn cos_v4f64(x: f64x4) -> f64x4;
+ #[link_name = "llvm.cos.v8f64"]
+ fn cos_v8f64(x: f64x8) -> f64x8;
+
+ #[link_name = "llvm.cos.f32"]
+ fn cos_f32(x: f32) -> f32;
+ #[link_name = "llvm.cos.f64"]
+ fn cos_f64(x: f64) -> f64;
+}
+
+gen_unary_impl_table!(Cos, cos);
+
+cfg_if! {
+ if #[cfg(target_arch = "s390x")] {
+ // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14
+ impl_unary!(f32x2[f32; 2]: cos_f32);
+ impl_unary!(f32x4[f32; 4]: cos_f32);
+ impl_unary!(f32x8[f32; 8]: cos_f32);
+ impl_unary!(f32x16[f32; 16]: cos_f32);
+
+ impl_unary!(f64x2[f64; 2]: cos_f64);
+ impl_unary!(f64x4[f64; 4]: cos_f64);
+ impl_unary!(f64x8[f64; 8]: cos_f64);
+ } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] {
+ use sleef_sys::*;
+ cfg_if! {
+ if #[cfg(target_feature = "avx2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10avx2128);
+ impl_unary!(f32x16[h => f32x8]: Sleef_cosf8_u10avx2);
+ impl_unary!(f64x8[h => f64x4]: Sleef_cosd4_u10avx2);
+
+ impl_unary!(f32x4: Sleef_cosf4_u10avx2128);
+ impl_unary!(f32x8: Sleef_cosf8_u10avx2);
+ impl_unary!(f64x2: Sleef_cosd2_u10avx2128);
+ impl_unary!(f64x4: Sleef_cosd4_u10avx2);
+ } else if #[cfg(target_feature = "avx")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10sse4);
+ impl_unary!(f32x16[h => f32x8]: Sleef_cosf8_u10avx);
+ impl_unary!(f64x8[h => f64x4]: Sleef_cosd4_u10avx);
+
+ impl_unary!(f32x4: Sleef_cosf4_u10sse4);
+ impl_unary!(f32x8: Sleef_cosf8_u10avx);
+ impl_unary!(f64x2: Sleef_cosd2_u10sse4);
+ impl_unary!(f64x4: Sleef_cosd4_u10avx);
+ } else if #[cfg(target_feature = "sse4.2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10sse4);
+ impl_unary!(f32x16[q => f32x4]: Sleef_cosf4_u10sse4);
+ impl_unary!(f64x8[q => f64x2]: Sleef_cosd2_u10sse4);
+
+ impl_unary!(f32x4: Sleef_cosf4_u10sse4);
+ impl_unary!(f32x8[h => f32x4]: Sleef_cosf4_u10sse4);
+ impl_unary!(f64x2: Sleef_cosd2_u10sse4);
+ impl_unary!(f64x4[h => f64x2]: Sleef_cosd2_u10sse4);
+ } else {
+ impl_unary!(f32x2[f32; 2]: cos_f32);
+ impl_unary!(f32x16: cos_v16f32);
+ impl_unary!(f64x8: cos_v8f64);
+
+ impl_unary!(f32x4: cos_v4f32);
+ impl_unary!(f32x8: cos_v8f32);
+ impl_unary!(f64x2: cos_v2f64);
+ impl_unary!(f64x4: cos_v4f64);
+ }
+ }
+ } else {
+ impl_unary!(f32x2[f32; 2]: cos_f32);
+ impl_unary!(f32x4: cos_v4f32);
+ impl_unary!(f32x8: cos_v8f32);
+ impl_unary!(f32x16: cos_v16f32);
+
+ impl_unary!(f64x2: cos_v2f64);
+ impl_unary!(f64x4: cos_v4f64);
+ impl_unary!(f64x8: cos_v8f64);
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/cos_pi.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/cos_pi.rs
new file mode 100644
index 0000000000..e283280ee4
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/cos_pi.rs
@@ -0,0 +1,87 @@
+//! Vertical floating-point `cos`
+#![allow(unused)]
+
+// FIXME 64-bit 1 elem vectors cos_pi
+
+use crate::*;
+
+pub(crate) trait CosPi {
+ fn cos_pi(self) -> Self;
+}
+
+gen_unary_impl_table!(CosPi, cos_pi);
+
+macro_rules! impl_def {
+ ($vid:ident, $PI:path) => {
+ impl CosPi for $vid {
+ #[inline]
+ fn cos_pi(self) -> Self {
+ (self * Self::splat($PI)).cos()
+ }
+ }
+ };
+}
+macro_rules! impl_def32 {
+ ($vid:ident) => {
+ impl_def!($vid, crate::f32::consts::PI);
+ };
+}
+macro_rules! impl_def64 {
+ ($vid:ident) => {
+ impl_def!($vid, crate::f64::consts::PI);
+ };
+}
+
+cfg_if! {
+ if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] {
+ use sleef_sys::*;
+ cfg_if! {
+ if #[cfg(target_feature = "avx2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05avx2128);
+ impl_unary!(f32x16[h => f32x8]: Sleef_cospif8_u05avx2);
+ impl_unary!(f64x8[h => f64x4]: Sleef_cospid4_u05avx2);
+
+ impl_unary!(f32x4: Sleef_cospif4_u05avx2128);
+ impl_unary!(f32x8: Sleef_cospif8_u05avx2);
+ impl_unary!(f64x2: Sleef_cospid2_u05avx2128);
+ impl_unary!(f64x4: Sleef_cospid4_u05avx2);
+ } else if #[cfg(target_feature = "avx")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05sse4);
+ impl_unary!(f32x16[h => f32x8]: Sleef_cospif8_u05avx);
+ impl_unary!(f64x8[h => f64x4]: Sleef_cospid4_u05avx);
+
+ impl_unary!(f32x4: Sleef_cospif4_u05sse4);
+ impl_unary!(f32x8: Sleef_cospif8_u05avx);
+ impl_unary!(f64x2: Sleef_cospid2_u05sse4);
+ impl_unary!(f64x4: Sleef_cospid4_u05avx);
+ } else if #[cfg(target_feature = "sse4.2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05sse4);
+ impl_unary!(f32x16[q => f32x4]: Sleef_cospif4_u05sse4);
+ impl_unary!(f64x8[q => f64x2]: Sleef_cospid2_u05sse4);
+
+ impl_unary!(f32x4: Sleef_cospif4_u05sse4);
+ impl_unary!(f32x8[h => f32x4]: Sleef_cospif4_u05sse4);
+ impl_unary!(f64x2: Sleef_cospid2_u05sse4);
+ impl_unary!(f64x4[h => f64x2]: Sleef_cospid2_u05sse4);
+ } else {
+ impl_def32!(f32x2);
+ impl_def32!(f32x4);
+ impl_def32!(f32x8);
+ impl_def32!(f32x16);
+
+ impl_def64!(f64x2);
+ impl_def64!(f64x4);
+ impl_def64!(f64x8);
+ }
+ }
+ } else {
+ impl_def32!(f32x2);
+ impl_def32!(f32x4);
+ impl_def32!(f32x8);
+ impl_def32!(f32x16);
+
+ impl_def64!(f64x2);
+ impl_def64!(f64x4);
+ impl_def64!(f64x8);
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/exp.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/exp.rs
new file mode 100644
index 0000000000..a7b20580e3
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/exp.rs
@@ -0,0 +1,112 @@
+//! Vertical floating-point `exp`
+#![allow(unused)]
+
+// FIXME 64-bit expgle elem vectors misexpg
+
+use crate::*;
+
+pub(crate) trait Exp {
+ fn exp(self) -> Self;
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.exp.v2f32"]
+ fn exp_v2f32(x: f32x2) -> f32x2;
+ #[link_name = "llvm.exp.v4f32"]
+ fn exp_v4f32(x: f32x4) -> f32x4;
+ #[link_name = "llvm.exp.v8f32"]
+ fn exp_v8f32(x: f32x8) -> f32x8;
+ #[link_name = "llvm.exp.v16f32"]
+ fn exp_v16f32(x: f32x16) -> f32x16;
+ /* FIXME 64-bit expgle elem vectors
+ #[link_name = "llvm.exp.v1f64"]
+ fn exp_v1f64(x: f64x1) -> f64x1;
+ */
+ #[link_name = "llvm.exp.v2f64"]
+ fn exp_v2f64(x: f64x2) -> f64x2;
+ #[link_name = "llvm.exp.v4f64"]
+ fn exp_v4f64(x: f64x4) -> f64x4;
+ #[link_name = "llvm.exp.v8f64"]
+ fn exp_v8f64(x: f64x8) -> f64x8;
+
+ #[link_name = "llvm.exp.f32"]
+ fn exp_f32(x: f32) -> f32;
+ #[link_name = "llvm.exp.f64"]
+ fn exp_f64(x: f64) -> f64;
+}
+
+gen_unary_impl_table!(Exp, exp);
+
+cfg_if! {
+ if #[cfg(target_arch = "s390x")] {
+ // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14
+ impl_unary!(f32x2[f32; 2]: exp_f32);
+ impl_unary!(f32x4[f32; 4]: exp_f32);
+ impl_unary!(f32x8[f32; 8]: exp_f32);
+ impl_unary!(f32x16[f32; 16]: exp_f32);
+
+ impl_unary!(f64x2[f64; 2]: exp_f64);
+ impl_unary!(f64x4[f64; 4]: exp_f64);
+ impl_unary!(f64x8[f64; 8]: exp_f64);
+ } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] {
+ use sleef_sys::*;
+ cfg_if! {
+ if #[cfg(target_feature = "avx2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10avx2128);
+ impl_unary!(f32x16[h => f32x8]: Sleef_expf8_u10avx2);
+ impl_unary!(f64x8[h => f64x4]: Sleef_expd4_u10avx2);
+
+ impl_unary!(f32x4: Sleef_expf4_u10avx2128);
+ impl_unary!(f32x8: Sleef_expf8_u10avx2);
+ impl_unary!(f64x2: Sleef_expd2_u10avx2128);
+ impl_unary!(f64x4: Sleef_expd4_u10avx2);
+ } else if #[cfg(target_feature = "avx")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse4);
+ impl_unary!(f32x16[h => f32x8]: Sleef_expf8_u10avx);
+ impl_unary!(f64x8[h => f64x4]: Sleef_expd4_u10avx);
+
+ impl_unary!(f32x4: Sleef_expf4_u10sse4);
+ impl_unary!(f32x8: Sleef_expf8_u10avx);
+ impl_unary!(f64x2: Sleef_expd2_u10sse4);
+ impl_unary!(f64x4: Sleef_expd4_u10avx);
+ } else if #[cfg(target_feature = "sse4.2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse4);
+ impl_unary!(f32x16[q => f32x4]: Sleef_expf4_u10sse4);
+ impl_unary!(f64x8[q => f64x2]: Sleef_expd2_u10sse4);
+
+ impl_unary!(f32x4: Sleef_expf4_u10sse4);
+ impl_unary!(f32x8[h => f32x4]: Sleef_expf4_u10sse4);
+ impl_unary!(f64x2: Sleef_expd2_u10sse4);
+ impl_unary!(f64x4[h => f64x2]: Sleef_expd2_u10sse4);
+ } else if #[cfg(target_feature = "sse2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse2);
+ impl_unary!(f32x16[q => f32x4]: Sleef_expf4_u10sse2);
+ impl_unary!(f64x8[q => f64x2]: Sleef_expd2_u10sse2);
+
+ impl_unary!(f32x4: Sleef_expf4_u10sse2);
+ impl_unary!(f32x8[h => f32x4]: Sleef_expf4_u10sse2);
+ impl_unary!(f64x2: Sleef_expd2_u10sse2);
+ impl_unary!(f64x4[h => f64x2]: Sleef_expd2_u10sse2);
+ } else {
+ impl_unary!(f32x2[f32; 2]: exp_f32);
+ impl_unary!(f32x16: exp_v16f32);
+ impl_unary!(f64x8: exp_v8f64);
+
+ impl_unary!(f32x4: exp_v4f32);
+ impl_unary!(f32x8: exp_v8f32);
+ impl_unary!(f64x2: exp_v2f64);
+ impl_unary!(f64x4: exp_v4f64);
+ }
+ }
+ } else {
+ impl_unary!(f32x2[f32; 2]: exp_f32);
+ impl_unary!(f32x4: exp_v4f32);
+ impl_unary!(f32x8: exp_v8f32);
+ impl_unary!(f32x16: exp_v16f32);
+
+ impl_unary!(f64x2: exp_v2f64);
+ impl_unary!(f64x4: exp_v4f64);
+ impl_unary!(f64x8: exp_v8f64);
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/ln.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/ln.rs
new file mode 100644
index 0000000000..a5e38cb40d
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/ln.rs
@@ -0,0 +1,112 @@
+//! Vertical floating-point `ln`
+#![allow(unused)]
+
+// FIXME 64-bit lngle elem vectors mislng
+
+use crate::*;
+
+pub(crate) trait Ln {
+ fn ln(self) -> Self;
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.log.v2f32"]
+ fn ln_v2f32(x: f32x2) -> f32x2;
+ #[link_name = "llvm.log.v4f32"]
+ fn ln_v4f32(x: f32x4) -> f32x4;
+ #[link_name = "llvm.log.v8f32"]
+ fn ln_v8f32(x: f32x8) -> f32x8;
+ #[link_name = "llvm.log.v16f32"]
+ fn ln_v16f32(x: f32x16) -> f32x16;
+ /* FIXME 64-bit lngle elem vectors
+ #[link_name = "llvm.log.v1f64"]
+ fn ln_v1f64(x: f64x1) -> f64x1;
+ */
+ #[link_name = "llvm.log.v2f64"]
+ fn ln_v2f64(x: f64x2) -> f64x2;
+ #[link_name = "llvm.log.v4f64"]
+ fn ln_v4f64(x: f64x4) -> f64x4;
+ #[link_name = "llvm.log.v8f64"]
+ fn ln_v8f64(x: f64x8) -> f64x8;
+
+ #[link_name = "llvm.log.f32"]
+ fn ln_f32(x: f32) -> f32;
+ #[link_name = "llvm.log.f64"]
+ fn ln_f64(x: f64) -> f64;
+}
+
+gen_unary_impl_table!(Ln, ln);
+
+cfg_if! {
+ if #[cfg(target_arch = "s390x")] {
+ // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14
+ impl_unary!(f32x2[f32; 2]: ln_f32);
+ impl_unary!(f32x4[f32; 4]: ln_f32);
+ impl_unary!(f32x8[f32; 8]: ln_f32);
+ impl_unary!(f32x16[f32; 16]: ln_f32);
+
+ impl_unary!(f64x2[f64; 2]: ln_f64);
+ impl_unary!(f64x4[f64; 4]: ln_f64);
+ impl_unary!(f64x8[f64; 8]: ln_f64);
+ } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] {
+ use sleef_sys::*;
+ cfg_if! {
+ if #[cfg(target_feature = "avx2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10avx2128);
+ impl_unary!(f32x16[h => f32x8]: Sleef_logf8_u10avx2);
+ impl_unary!(f64x8[h => f64x4]: Sleef_logd4_u10avx2);
+
+ impl_unary!(f32x4: Sleef_logf4_u10avx2128);
+ impl_unary!(f32x8: Sleef_logf8_u10avx2);
+ impl_unary!(f64x2: Sleef_logd2_u10avx2128);
+ impl_unary!(f64x4: Sleef_logd4_u10avx2);
+ } else if #[cfg(target_feature = "avx")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse4);
+ impl_unary!(f32x16[h => f32x8]: Sleef_logf8_u10avx);
+ impl_unary!(f64x8[h => f64x4]: Sleef_logd4_u10avx);
+
+ impl_unary!(f32x4: Sleef_logf4_u10sse4);
+ impl_unary!(f32x8: Sleef_logf8_u10avx);
+ impl_unary!(f64x2: Sleef_logd2_u10sse4);
+ impl_unary!(f64x4: Sleef_logd4_u10avx);
+ } else if #[cfg(target_feature = "sse4.2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse4);
+ impl_unary!(f32x16[q => f32x4]: Sleef_logf4_u10sse4);
+ impl_unary!(f64x8[q => f64x2]: Sleef_logd2_u10sse4);
+
+ impl_unary!(f32x4: Sleef_logf4_u10sse4);
+ impl_unary!(f32x8[h => f32x4]: Sleef_logf4_u10sse4);
+ impl_unary!(f64x2: Sleef_logd2_u10sse4);
+ impl_unary!(f64x4[h => f64x2]: Sleef_logd2_u10sse4);
+ } else if #[cfg(target_feature = "sse2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse2);
+ impl_unary!(f32x16[q => f32x4]: Sleef_logf4_u10sse2);
+ impl_unary!(f64x8[q => f64x2]: Sleef_logd2_u10sse2);
+
+ impl_unary!(f32x4: Sleef_logf4_u10sse2);
+ impl_unary!(f32x8[h => f32x4]: Sleef_logf4_u10sse2);
+ impl_unary!(f64x2: Sleef_logd2_u10sse2);
+ impl_unary!(f64x4[h => f64x2]: Sleef_logd2_u10sse2);
+ } else {
+ impl_unary!(f32x2[f32; 2]: ln_f32);
+ impl_unary!(f32x16: ln_v16f32);
+ impl_unary!(f64x8: ln_v8f64);
+
+ impl_unary!(f32x4: ln_v4f32);
+ impl_unary!(f32x8: ln_v8f32);
+ impl_unary!(f64x2: ln_v2f64);
+ impl_unary!(f64x4: ln_v4f64);
+ }
+ }
+ } else {
+ impl_unary!(f32x2[f32; 2]: ln_f32);
+ impl_unary!(f32x4: ln_v4f32);
+ impl_unary!(f32x8: ln_v8f32);
+ impl_unary!(f32x16: ln_v16f32);
+
+ impl_unary!(f64x2: ln_v2f64);
+ impl_unary!(f64x4: ln_v4f64);
+ impl_unary!(f64x8: ln_v8f64);
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/macros.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/macros.rs
new file mode 100644
index 0000000000..8daee1afe2
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/macros.rs
@@ -0,0 +1,470 @@
+//! Utility macros
+#![allow(unused)]
+
+macro_rules! impl_unary_ {
+ // implementation mapping 1:1
+ (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident,
+ $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self) -> Self {
+ unsafe {
+ use crate::mem::transmute;
+ transmute($fun(transmute(self)))
+ }
+ }
+ }
+ };
+ // implementation mapping 1:1 for when `$fun` is a generic function
+ // like some of the fp math rustc intrinsics (e.g. `fn fun<T>(x: T) -> T`).
+ (gen | $trait_id:ident, $trait_method:ident, $vec_id:ident,
+ $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self) -> Self {
+ unsafe {
+ use crate::mem::transmute;
+ transmute($fun(self.0))
+ }
+ }
+ }
+ };
+ (scalar | $trait_id:ident, $trait_method:ident,
+ $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self) -> Self {
+ unsafe {
+ union U {
+ vec: $vec_id,
+ scalars: [$sid; $scount],
+ }
+ let mut scalars = U { vec: self }.scalars;
+ for i in &mut scalars {
+ *i = $fun(*i);
+ }
+ U { scalars }.vec
+ }
+ }
+ }
+ };
+ // implementation calling fun twice on each of the vector halves:
+ (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident,
+ $vech_id:ident, $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self) -> Self {
+ unsafe {
+ use crate::mem::transmute;
+ union U {
+ vec: $vec_id,
+ halves: [$vech_id; 2],
+ }
+
+ let mut halves = U { vec: self }.halves;
+
+ *halves.get_unchecked_mut(0) = transmute($fun(transmute(*halves.get_unchecked(0))));
+ *halves.get_unchecked_mut(1) = transmute($fun(transmute(*halves.get_unchecked(1))));
+
+ U { halves }.vec
+ }
+ }
+ }
+ };
+ // implementation calling fun four times on each of the vector quarters:
+ (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident,
+ $vecq_id:ident, $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self) -> Self {
+ unsafe {
+ use crate::mem::transmute;
+ union U {
+ vec: $vec_id,
+ quarters: [$vecq_id; 4],
+ }
+
+ let mut quarters = U { vec: self }.quarters;
+
+ *quarters.get_unchecked_mut(0) = transmute($fun(transmute(*quarters.get_unchecked(0))));
+ *quarters.get_unchecked_mut(1) = transmute($fun(transmute(*quarters.get_unchecked(1))));
+ *quarters.get_unchecked_mut(2) = transmute($fun(transmute(*quarters.get_unchecked(2))));
+ *quarters.get_unchecked_mut(3) = transmute($fun(transmute(*quarters.get_unchecked(3))));
+
+ U { quarters }.vec
+ }
+ }
+ }
+ };
+ // implementation calling fun once on a vector twice as large:
+ (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident,
+ $vect_id:ident, $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self) -> Self {
+ unsafe {
+ use crate::mem::{transmute, uninitialized};
+
+ union U {
+ vec: [$vec_id; 2],
+ twice: $vect_id,
+ }
+
+ let twice = U { vec: [self, uninitialized()] }.twice;
+ let twice = transmute($fun(transmute(twice)));
+
+ *(U { twice }.vec.get_unchecked(0))
+ }
+ }
+ }
+ };
+}
+
+macro_rules! gen_unary_impl_table {
+ ($trait_id:ident, $trait_method:ident) => {
+ macro_rules! impl_unary {
+ ($vid:ident: $fun:ident) => {
+ impl_unary_!(vec | $trait_id, $trait_method, $vid, $fun);
+ };
+ ($vid:ident[g]: $fun:ident) => {
+ impl_unary_!(gen | $trait_id, $trait_method, $vid, $fun);
+ };
+ ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => {
+ impl_unary_!(scalar | $trait_id, $trait_method, $vid, [$sid; $sc], $fun);
+ };
+ ($vid:ident[s]: $fun:ident) => {
+ impl_unary_!(scalar | $trait_id, $trait_method, $vid, $fun);
+ };
+ ($vid:ident[h => $vid_h:ident]: $fun:ident) => {
+ impl_unary_!(halves | $trait_id, $trait_method, $vid, $vid_h, $fun);
+ };
+ ($vid:ident[q => $vid_q:ident]: $fun:ident) => {
+ impl_unary_!(quarter | $trait_id, $trait_method, $vid, $vid_q, $fun);
+ };
+ ($vid:ident[t => $vid_t:ident]: $fun:ident) => {
+ impl_unary_!(twice | $trait_id, $trait_method, $vid, $vid_t, $fun);
+ };
+ }
+ };
+}
+
+macro_rules! impl_tertiary_ {
+ // implementation mapping 1:1
+ (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident,
+ $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self, y: Self, z: Self) -> Self {
+ unsafe {
+ use crate::mem::transmute;
+ transmute($fun(transmute(self), transmute(y), transmute(z)))
+ }
+ }
+ }
+ };
+ (scalar | $trait_id:ident, $trait_method:ident,
+ $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self, y: Self, z: Self) -> Self {
+ unsafe {
+ union U {
+ vec: $vec_id,
+ scalars: [$sid; $scount],
+ }
+ let mut x = U { vec: self }.scalars;
+ let y = U { vec: y }.scalars;
+ let z = U { vec: z }.scalars;
+ for (x, (y, z)) in (&mut scalars).zip(&y).zip(&z) {
+ *i = $fun(*i, *y, *z);
+ }
+ U { vec: x }.vec
+ }
+ }
+ }
+ };
+ // implementation calling fun twice on each of the vector halves:
+ (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident,
+ $vech_id:ident, $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self, y: Self, z: Self) -> Self {
+ unsafe {
+ use crate::mem::transmute;
+ union U {
+ vec: $vec_id,
+ halves: [$vech_id; 2],
+ }
+
+ let mut x_halves = U { vec: self }.halves;
+ let y_halves = U { vec: y }.halves;
+ let z_halves = U { vec: z }.halves;
+
+ *x_halves.get_unchecked_mut(0) = transmute($fun(
+ transmute(*x_halves.get_unchecked(0)),
+ transmute(*y_halves.get_unchecked(0)),
+ transmute(*z_halves.get_unchecked(0)),
+ ));
+ *x_halves.get_unchecked_mut(1) = transmute($fun(
+ transmute(*x_halves.get_unchecked(1)),
+ transmute(*y_halves.get_unchecked(1)),
+ transmute(*z_halves.get_unchecked(1)),
+ ));
+
+ U { halves: x_halves }.vec
+ }
+ }
+ }
+ };
+ // implementation calling fun four times on each of the vector quarters:
+ (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident,
+ $vecq_id:ident, $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self, y: Self, z: Self) -> Self {
+ unsafe {
+ use crate::mem::transmute;
+ union U {
+ vec: $vec_id,
+ quarters: [$vecq_id; 4],
+ }
+
+ let mut x_quarters = U { vec: self }.quarters;
+ let y_quarters = U { vec: y }.quarters;
+ let z_quarters = U { vec: z }.quarters;
+
+ *x_quarters.get_unchecked_mut(0) = transmute($fun(
+ transmute(*x_quarters.get_unchecked(0)),
+ transmute(*y_quarters.get_unchecked(0)),
+ transmute(*z_quarters.get_unchecked(0)),
+ ));
+
+ *x_quarters.get_unchecked_mut(1) = transmute($fun(
+ transmute(*x_quarters.get_unchecked(1)),
+ transmute(*y_quarters.get_unchecked(1)),
+ transmute(*z_quarters.get_unchecked(1)),
+ ));
+
+ *x_quarters.get_unchecked_mut(2) = transmute($fun(
+ transmute(*x_quarters.get_unchecked(2)),
+ transmute(*y_quarters.get_unchecked(2)),
+ transmute(*z_quarters.get_unchecked(2)),
+ ));
+
+ *x_quarters.get_unchecked_mut(3) = transmute($fun(
+ transmute(*x_quarters.get_unchecked(3)),
+ transmute(*y_quarters.get_unchecked(3)),
+ transmute(*z_quarters.get_unchecked(3)),
+ ));
+
+ U { quarters: x_quarters }.vec
+ }
+ }
+ }
+ };
+ // implementation calling fun once on a vector twice as large:
+ (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident,
+ $vect_id:ident, $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self, y: Self, z: Self) -> Self {
+ unsafe {
+ use crate::mem::{transmute, uninitialized};
+
+ union U {
+ vec: [$vec_id; 2],
+ twice: $vect_id,
+ }
+
+ let x_twice = U { vec: [self, uninitialized()] }.twice;
+ let y_twice = U { vec: [y, uninitialized()] }.twice;
+ let z_twice = U { vec: [z, uninitialized()] }.twice;
+ let twice: $vect_id =
+ transmute($fun(transmute(x_twice), transmute(y_twice), transmute(z_twice)));
+
+ *(U { twice }.vec.get_unchecked(0))
+ }
+ }
+ }
+ };
+}
+
+macro_rules! gen_tertiary_impl_table {
+ ($trait_id:ident, $trait_method:ident) => {
+ macro_rules! impl_tertiary {
+ ($vid:ident: $fun:ident) => {
+ impl_tertiary_!(vec | $trait_id, $trait_method, $vid, $fun);
+ };
+ ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => {
+ impl_tertiary_!(scalar | $trait_id, $trait_method, $vid, [$sid; $sc], $fun);
+ };
+ ($vid:ident[s]: $fun:ident) => {
+ impl_tertiary_!(scalar | $trait_id, $trait_method, $vid, $fun);
+ };
+ ($vid:ident[h => $vid_h:ident]: $fun:ident) => {
+ impl_tertiary_!(halves | $trait_id, $trait_method, $vid, $vid_h, $fun);
+ };
+ ($vid:ident[q => $vid_q:ident]: $fun:ident) => {
+ impl_tertiary_!(quarter | $trait_id, $trait_method, $vid, $vid_q, $fun);
+ };
+ ($vid:ident[t => $vid_t:ident]: $fun:ident) => {
+ impl_tertiary_!(twice | $trait_id, $trait_method, $vid, $vid_t, $fun);
+ };
+ }
+ };
+}
+
+macro_rules! impl_binary_ {
+ // implementation mapping 1:1
+ (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident,
+ $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self, y: Self) -> Self {
+ unsafe {
+ use crate::mem::transmute;
+ transmute($fun(transmute(self), transmute(y)))
+ }
+ }
+ }
+ };
+ (scalar | $trait_id:ident, $trait_method:ident,
+ $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self, y: Self) -> Self {
+ unsafe {
+ union U {
+ vec: $vec_id,
+ scalars: [$sid; $scount],
+ }
+ let mut x = U { vec: self }.scalars;
+ let y = U { vec: y }.scalars;
+ for (x, y) in x.iter_mut().zip(&y) {
+ *x = $fun(*x, *y);
+ }
+ U { scalars: x }.vec
+ }
+ }
+ }
+ };
+ // implementation calling fun twice on each of the vector halves:
+ (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident,
+ $vech_id:ident, $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self, y: Self) -> Self {
+ unsafe {
+ use crate::mem::transmute;
+ union U {
+ vec: $vec_id,
+ halves: [$vech_id; 2],
+ }
+
+ let mut x_halves = U { vec: self }.halves;
+ let y_halves = U { vec: y }.halves;
+
+ *x_halves.get_unchecked_mut(0) = transmute($fun(
+ transmute(*x_halves.get_unchecked(0)),
+ transmute(*y_halves.get_unchecked(0)),
+ ));
+ *x_halves.get_unchecked_mut(1) = transmute($fun(
+ transmute(*x_halves.get_unchecked(1)),
+ transmute(*y_halves.get_unchecked(1)),
+ ));
+
+ U { halves: x_halves }.vec
+ }
+ }
+ }
+ };
+ // implementation calling fun four times on each of the vector quarters:
+ (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident,
+ $vecq_id:ident, $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self, y: Self) -> Self {
+ unsafe {
+ use crate::mem::transmute;
+ union U {
+ vec: $vec_id,
+ quarters: [$vecq_id; 4],
+ }
+
+ let mut x_quarters = U { vec: self }.quarters;
+ let y_quarters = U { vec: y }.quarters;
+
+ *x_quarters.get_unchecked_mut(0) = transmute($fun(
+ transmute(*x_quarters.get_unchecked(0)),
+ transmute(*y_quarters.get_unchecked(0)),
+ ));
+
+ *x_quarters.get_unchecked_mut(1) = transmute($fun(
+ transmute(*x_quarters.get_unchecked(1)),
+ transmute(*y_quarters.get_unchecked(1)),
+ ));
+
+ *x_quarters.get_unchecked_mut(2) = transmute($fun(
+ transmute(*x_quarters.get_unchecked(2)),
+ transmute(*y_quarters.get_unchecked(2)),
+ ));
+
+ *x_quarters.get_unchecked_mut(3) = transmute($fun(
+ transmute(*x_quarters.get_unchecked(3)),
+ transmute(*y_quarters.get_unchecked(3)),
+ ));
+
+ U { quarters: x_quarters }.vec
+ }
+ }
+ }
+ };
+ // implementation calling fun once on a vector twice as large:
+ (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident,
+ $vect_id:ident, $fun:ident) => {
+ impl $trait_id for $vec_id {
+ #[inline]
+ fn $trait_method(self, y: Self) -> Self {
+ unsafe {
+ use crate::mem::{transmute, uninitialized};
+
+ union U {
+ vec: [$vec_id; 2],
+ twice: $vect_id,
+ }
+
+ let x_twice = U { vec: [self, uninitialized()] }.twice;
+ let y_twice = U { vec: [y, uninitialized()] }.twice;
+ let twice: $vect_id = transmute($fun(transmute(x_twice), transmute(y_twice)));
+
+ *(U { twice }.vec.get_unchecked(0))
+ }
+ }
+ }
+ };
+}
+
+macro_rules! gen_binary_impl_table {
+ ($trait_id:ident, $trait_method:ident) => {
+ macro_rules! impl_binary {
+ ($vid:ident: $fun:ident) => {
+ impl_binary_!(vec | $trait_id, $trait_method, $vid, $fun);
+ };
+ ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => {
+ impl_binary_!(scalar | $trait_id, $trait_method, $vid, [$sid; $sc], $fun);
+ };
+ ($vid:ident[s]: $fun:ident) => {
+ impl_binary_!(scalar | $trait_id, $trait_method, $vid, $fun);
+ };
+ ($vid:ident[h => $vid_h:ident]: $fun:ident) => {
+ impl_binary_!(halves | $trait_id, $trait_method, $vid, $vid_h, $fun);
+ };
+ ($vid:ident[q => $vid_q:ident]: $fun:ident) => {
+ impl_binary_!(quarter | $trait_id, $trait_method, $vid, $vid_q, $fun);
+ };
+ ($vid:ident[t => $vid_t:ident]: $fun:ident) => {
+ impl_binary_!(twice | $trait_id, $trait_method, $vid, $vid_t, $fun);
+ };
+ }
+ };
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/mul_add.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/mul_add.rs
new file mode 100644
index 0000000000..d37f30fa86
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/mul_add.rs
@@ -0,0 +1,109 @@
+//! Vertical floating-point `mul_add`
+#![allow(unused)]
+use crate::*;
+
+// FIXME: 64-bit 1 element mul_add
+
+pub(crate) trait MulAdd {
+ fn mul_add(self, y: Self, z: Self) -> Self;
+}
+
+#[cfg(not(target_arch = "s390x"))]
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.fma.v2f32"]
+ fn fma_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2;
+ #[link_name = "llvm.fma.v4f32"]
+ fn fma_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4;
+ #[link_name = "llvm.fma.v8f32"]
+ fn fma_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8;
+ #[link_name = "llvm.fma.v16f32"]
+ fn fma_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16;
+ /* FIXME 64-bit single elem vectors
+ #[link_name = "llvm.fma.v1f64"]
+ fn fma_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1;
+ */
+ #[link_name = "llvm.fma.v2f64"]
+ fn fma_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2;
+ #[link_name = "llvm.fma.v4f64"]
+ fn fma_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4;
+ #[link_name = "llvm.fma.v8f64"]
+ fn fma_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8;
+}
+
+gen_tertiary_impl_table!(MulAdd, mul_add);
+
+cfg_if! {
+ if #[cfg(target_arch = "s390x")] {
+ // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14
+ macro_rules! impl_broken {
+ ($id:ident) => {
+ impl MulAdd for $id {
+ #[inline]
+ fn mul_add(self, y: Self, z: Self) -> Self {
+ self * y + z
+ }
+ }
+ };
+ }
+
+ impl_broken!(f32x2);
+ impl_broken!(f32x4);
+ impl_broken!(f32x8);
+ impl_broken!(f32x16);
+
+ impl_broken!(f64x2);
+ impl_broken!(f64x4);
+ impl_broken!(f64x8);
+ } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] {
+ use sleef_sys::*;
+ cfg_if! {
+ if #[cfg(target_feature = "avx2")] {
+ impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_avx2128);
+ impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx2);
+ impl_tertiary!(f64x8[h => f64x4]: Sleef_fmad4_avx2);
+
+ impl_tertiary!(f32x4: Sleef_fmaf4_avx2128);
+ impl_tertiary!(f32x8: Sleef_fmaf8_avx2);
+ impl_tertiary!(f64x2: Sleef_fmad2_avx2128);
+ impl_tertiary!(f64x4: Sleef_fmad4_avx2);
+ } else if #[cfg(target_feature = "avx")] {
+ impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_sse4);
+ impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx);
+ impl_tertiary!(f64x8[h => f64x4]: Sleef_fmad4_avx);
+
+ impl_tertiary!(f32x4: Sleef_fmaf4_sse4);
+ impl_tertiary!(f32x8: Sleef_fmaf8_avx);
+ impl_tertiary!(f64x2: Sleef_fmad2_sse4);
+ impl_tertiary!(f64x4: Sleef_fmad4_avx);
+ } else if #[cfg(target_feature = "sse4.2")] {
+ impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_sse4);
+ impl_tertiary!(f32x16[q => f32x4]: Sleef_fmaf4_sse4);
+ impl_tertiary!(f64x8[q => f64x2]: Sleef_fmad2_sse4);
+
+ impl_tertiary!(f32x4: Sleef_fmaf4_sse4);
+ impl_tertiary!(f32x8[h => f32x4]: Sleef_fmaf4_sse4);
+ impl_tertiary!(f64x2: Sleef_fmad2_sse4);
+ impl_tertiary!(f64x4[h => f64x2]: Sleef_fmad2_sse4);
+ } else {
+ impl_tertiary!(f32x2: fma_v2f32);
+ impl_tertiary!(f32x16: fma_v16f32);
+ impl_tertiary!(f64x8: fma_v8f64);
+
+ impl_tertiary!(f32x4: fma_v4f32);
+ impl_tertiary!(f32x8: fma_v8f32);
+ impl_tertiary!(f64x2: fma_v2f64);
+ impl_tertiary!(f64x4: fma_v4f64);
+ }
+ }
+ } else {
+ impl_tertiary!(f32x2: fma_v2f32);
+ impl_tertiary!(f32x4: fma_v4f32);
+ impl_tertiary!(f32x8: fma_v8f32);
+ impl_tertiary!(f32x16: fma_v16f32);
+ // impl_tertiary!(f64x1: fma_v1f64); // FIXME 64-bit fmagle elem vectors
+ impl_tertiary!(f64x2: fma_v2f64);
+ impl_tertiary!(f64x4: fma_v4f64);
+ impl_tertiary!(f64x8: fma_v8f64);
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/mul_adde.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/mul_adde.rs
new file mode 100644
index 0000000000..c0baeacec2
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/mul_adde.rs
@@ -0,0 +1,60 @@
+//! Approximation for floating-point `mul_add`
+use crate::*;
+
+// FIXME: 64-bit 1 element mul_adde
+
+pub(crate) trait MulAddE {
+ fn mul_adde(self, y: Self, z: Self) -> Self;
+}
+
+#[cfg(not(target_arch = "s390x"))]
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.fmuladd.v2f32"]
+ fn fmuladd_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2;
+ #[link_name = "llvm.fmuladd.v4f32"]
+ fn fmuladd_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4;
+ #[link_name = "llvm.fmuladd.v8f32"]
+ fn fmuladd_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8;
+ #[link_name = "llvm.fmuladd.v16f32"]
+ fn fmuladd_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16;
+ /* FIXME 64-bit single elem vectors
+ #[link_name = "llvm.fmuladd.v1f64"]
+ fn fmuladd_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1;
+ */
+ #[link_name = "llvm.fmuladd.v2f64"]
+ fn fmuladd_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2;
+ #[link_name = "llvm.fmuladd.v4f64"]
+ fn fmuladd_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4;
+ #[link_name = "llvm.fmuladd.v8f64"]
+ fn fmuladd_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8;
+}
+
+macro_rules! impl_mul_adde {
+ ($id:ident : $fn:ident) => {
+ impl MulAddE for $id {
+ #[inline]
+ fn mul_adde(self, y: Self, z: Self) -> Self {
+ #[cfg(not(target_arch = "s390x"))]
+ {
+ use crate::mem::transmute;
+ unsafe { transmute($fn(transmute(self), transmute(y), transmute(z))) }
+ }
+ #[cfg(target_arch = "s390x")]
+ {
+ // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14
+ self * y + z
+ }
+ }
+ }
+ };
+}
+
+impl_mul_adde!(f32x2: fmuladd_v2f32);
+impl_mul_adde!(f32x4: fmuladd_v4f32);
+impl_mul_adde!(f32x8: fmuladd_v8f32);
+impl_mul_adde!(f32x16: fmuladd_v16f32);
+// impl_mul_adde!(f64x1: fma_v1f64); // FIXME 64-bit fmagle elem vectors
+impl_mul_adde!(f64x2: fmuladd_v2f64);
+impl_mul_adde!(f64x4: fmuladd_v4f64);
+impl_mul_adde!(f64x8: fmuladd_v8f64);
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/powf.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/powf.rs
new file mode 100644
index 0000000000..89ca52e96d
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/powf.rs
@@ -0,0 +1,112 @@
+//! Vertical floating-point `powf`
+#![allow(unused)]
+
+// FIXME 64-bit powfgle elem vectors mispowfg
+
+use crate::*;
+
+pub(crate) trait Powf {
+ fn powf(self, x: Self) -> Self;
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.pow.v2f32"]
+ fn powf_v2f32(x: f32x2, y: f32x2) -> f32x2;
+ #[link_name = "llvm.pow.v4f32"]
+ fn powf_v4f32(x: f32x4, y: f32x4) -> f32x4;
+ #[link_name = "llvm.pow.v8f32"]
+ fn powf_v8f32(x: f32x8, y: f32x8) -> f32x8;
+ #[link_name = "llvm.pow.v16f32"]
+ fn powf_v16f32(x: f32x16, y: f32x16) -> f32x16;
+ /* FIXME 64-bit powfgle elem vectors
+ #[link_name = "llvm.pow.v1f64"]
+ fn powf_v1f64(x: f64x1, y: f64x1) -> f64x1;
+ */
+ #[link_name = "llvm.pow.v2f64"]
+ fn powf_v2f64(x: f64x2, y: f64x2) -> f64x2;
+ #[link_name = "llvm.pow.v4f64"]
+ fn powf_v4f64(x: f64x4, y: f64x4) -> f64x4;
+ #[link_name = "llvm.pow.v8f64"]
+ fn powf_v8f64(x: f64x8, y: f64x8) -> f64x8;
+
+ #[link_name = "llvm.pow.f32"]
+ fn powf_f32(x: f32, y: f32) -> f32;
+ #[link_name = "llvm.pow.f64"]
+ fn powf_f64(x: f64, y: f64) -> f64;
+}
+
+gen_binary_impl_table!(Powf, powf);
+
+cfg_if! {
+ if #[cfg(target_arch = "s390x")] {
+ // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14
+ impl_binary!(f32x2[f32; 2]: powf_f32);
+ impl_binary!(f32x4[f32; 4]: powf_f32);
+ impl_binary!(f32x8[f32; 8]: powf_f32);
+ impl_binary!(f32x16[f32; 16]: powf_f32);
+
+ impl_binary!(f64x2[f64; 2]: powf_f64);
+ impl_binary!(f64x4[f64; 4]: powf_f64);
+ impl_binary!(f64x8[f64; 8]: powf_f64);
+ } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] {
+ use sleef_sys::*;
+ cfg_if! {
+ if #[cfg(target_feature = "avx2")] {
+ impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10avx2128);
+ impl_binary!(f32x16[h => f32x8]: Sleef_powf8_u10avx2);
+ impl_binary!(f64x8[h => f64x4]: Sleef_powd4_u10avx2);
+
+ impl_binary!(f32x4: Sleef_powf4_u10avx2128);
+ impl_binary!(f32x8: Sleef_powf8_u10avx2);
+ impl_binary!(f64x2: Sleef_powd2_u10avx2128);
+ impl_binary!(f64x4: Sleef_powd4_u10avx2);
+ } else if #[cfg(target_feature = "avx")] {
+ impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse4);
+ impl_binary!(f32x16[h => f32x8]: Sleef_powf8_u10avx);
+ impl_binary!(f64x8[h => f64x4]: Sleef_powd4_u10avx);
+
+ impl_binary!(f32x4: Sleef_powf4_u10sse4);
+ impl_binary!(f32x8: Sleef_powf8_u10avx);
+ impl_binary!(f64x2: Sleef_powd2_u10sse4);
+ impl_binary!(f64x4: Sleef_powd4_u10avx);
+ } else if #[cfg(target_feature = "sse4.2")] {
+ impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse4);
+ impl_binary!(f32x16[q => f32x4]: Sleef_powf4_u10sse4);
+ impl_binary!(f64x8[q => f64x2]: Sleef_powd2_u10sse4);
+
+ impl_binary!(f32x4: Sleef_powf4_u10sse4);
+ impl_binary!(f32x8[h => f32x4]: Sleef_powf4_u10sse4);
+ impl_binary!(f64x2: Sleef_powd2_u10sse4);
+ impl_binary!(f64x4[h => f64x2]: Sleef_powd2_u10sse4);
+ } else if #[cfg(target_feature = "sse2")] {
+ impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse2);
+ impl_binary!(f32x16[q => f32x4]: Sleef_powf4_u10sse2);
+ impl_binary!(f64x8[q => f64x2]: Sleef_powd2_u10sse2);
+
+ impl_binary!(f32x4: Sleef_powf4_u10sse2);
+ impl_binary!(f32x8[h => f32x4]: Sleef_powf4_u10sse2);
+ impl_binary!(f64x2: Sleef_powd2_u10sse2);
+ impl_binary!(f64x4[h => f64x2]: Sleef_powd2_u10sse2);
+ } else {
+ impl_binary!(f32x2[f32; 2]: powf_f32);
+ impl_binary!(f32x4: powf_v4f32);
+ impl_binary!(f32x8: powf_v8f32);
+ impl_binary!(f32x16: powf_v16f32);
+
+ impl_binary!(f64x2: powf_v2f64);
+ impl_binary!(f64x4: powf_v4f64);
+ impl_binary!(f64x8: powf_v8f64);
+ }
+ }
+ } else {
+ impl_binary!(f32x2[f32; 2]: powf_f32);
+ impl_binary!(f32x4: powf_v4f32);
+ impl_binary!(f32x8: powf_v8f32);
+ impl_binary!(f32x16: powf_v16f32);
+
+ impl_binary!(f64x2: powf_v2f64);
+ impl_binary!(f64x4: powf_v4f64);
+ impl_binary!(f64x8: powf_v8f64);
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/sin.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/sin.rs
new file mode 100644
index 0000000000..d881415909
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/sin.rs
@@ -0,0 +1,103 @@
+//! Vertical floating-point `sin`
+#![allow(unused)]
+
+// FIXME 64-bit 1 elem vectors sin
+
+use crate::*;
+
+pub(crate) trait Sin {
+ fn sin(self) -> Self;
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.sin.v2f32"]
+ fn sin_v2f32(x: f32x2) -> f32x2;
+ #[link_name = "llvm.sin.v4f32"]
+ fn sin_v4f32(x: f32x4) -> f32x4;
+ #[link_name = "llvm.sin.v8f32"]
+ fn sin_v8f32(x: f32x8) -> f32x8;
+ #[link_name = "llvm.sin.v16f32"]
+ fn sin_v16f32(x: f32x16) -> f32x16;
+ /* FIXME 64-bit single elem vectors
+ #[link_name = "llvm.sin.v1f64"]
+ fn sin_v1f64(x: f64x1) -> f64x1;
+ */
+ #[link_name = "llvm.sin.v2f64"]
+ fn sin_v2f64(x: f64x2) -> f64x2;
+ #[link_name = "llvm.sin.v4f64"]
+ fn sin_v4f64(x: f64x4) -> f64x4;
+ #[link_name = "llvm.sin.v8f64"]
+ fn sin_v8f64(x: f64x8) -> f64x8;
+
+ #[link_name = "llvm.sin.f32"]
+ fn sin_f32(x: f32) -> f32;
+ #[link_name = "llvm.sin.f64"]
+ fn sin_f64(x: f64) -> f64;
+}
+
+gen_unary_impl_table!(Sin, sin);
+
+cfg_if! {
+ if #[cfg(target_arch = "s390x")] {
+ // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14
+ impl_unary!(f32x2[f32; 2]: sin_f32);
+ impl_unary!(f32x4[f32; 4]: sin_f32);
+ impl_unary!(f32x8[f32; 8]: sin_f32);
+ impl_unary!(f32x16[f32; 16]: sin_f32);
+
+ impl_unary!(f64x2[f64; 2]: sin_f64);
+ impl_unary!(f64x4[f64; 4]: sin_f64);
+ impl_unary!(f64x8[f64; 8]: sin_f64);
+ } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] {
+ use sleef_sys::*;
+ cfg_if! {
+ if #[cfg(target_feature = "avx2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10avx2128);
+ impl_unary!(f32x16[h => f32x8]: Sleef_sinf8_u10avx2);
+ impl_unary!(f64x8[h => f64x4]: Sleef_sind4_u10avx2);
+
+ impl_unary!(f32x4: Sleef_sinf4_u10avx2128);
+ impl_unary!(f32x8: Sleef_sinf8_u10avx2);
+ impl_unary!(f64x2: Sleef_sind2_u10avx2128);
+ impl_unary!(f64x4: Sleef_sind4_u10avx2);
+ } else if #[cfg(target_feature = "avx")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10sse4);
+ impl_unary!(f32x16[h => f32x8]: Sleef_sinf8_u10avx);
+ impl_unary!(f64x8[h => f64x4]: Sleef_sind4_u10avx);
+
+ impl_unary!(f32x4: Sleef_sinf4_u10sse4);
+ impl_unary!(f32x8: Sleef_sinf8_u10avx);
+ impl_unary!(f64x2: Sleef_sind2_u10sse4);
+ impl_unary!(f64x4: Sleef_sind4_u10avx);
+ } else if #[cfg(target_feature = "sse4.2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10sse4);
+ impl_unary!(f32x16[q => f32x4]: Sleef_sinf4_u10sse4);
+ impl_unary!(f64x8[q => f64x2]: Sleef_sind2_u10sse4);
+
+ impl_unary!(f32x4: Sleef_sinf4_u10sse4);
+ impl_unary!(f32x8[h => f32x4]: Sleef_sinf4_u10sse4);
+ impl_unary!(f64x2: Sleef_sind2_u10sse4);
+ impl_unary!(f64x4[h => f64x2]: Sleef_sind2_u10sse4);
+ } else {
+ impl_unary!(f32x2[f32; 2]: sin_f32);
+ impl_unary!(f32x16: sin_v16f32);
+ impl_unary!(f64x8: sin_v8f64);
+
+ impl_unary!(f32x4: sin_v4f32);
+ impl_unary!(f32x8: sin_v8f32);
+ impl_unary!(f64x2: sin_v2f64);
+ impl_unary!(f64x4: sin_v4f64);
+ }
+ }
+ } else {
+ impl_unary!(f32x2[f32; 2]: sin_f32);
+ impl_unary!(f32x4: sin_v4f32);
+ impl_unary!(f32x8: sin_v8f32);
+ impl_unary!(f32x16: sin_v16f32);
+
+ impl_unary!(f64x2: sin_v2f64);
+ impl_unary!(f64x4: sin_v4f64);
+ impl_unary!(f64x8: sin_v8f64);
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/sin_cos_pi.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/sin_cos_pi.rs
new file mode 100644
index 0000000000..b283d11111
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/sin_cos_pi.rs
@@ -0,0 +1,188 @@
+//! Vertical floating-point `sin_cos`
+#![allow(unused)]
+
+// FIXME 64-bit 1 elem vectors sin_cos
+
+use crate::*;
+
+pub(crate) trait SinCosPi: Sized {
+ type Output;
+ fn sin_cos_pi(self) -> Self::Output;
+}
+
+macro_rules! impl_def {
+ ($vid:ident, $PI:path) => {
+ impl SinCosPi for $vid {
+ type Output = (Self, Self);
+ #[inline]
+ fn sin_cos_pi(self) -> Self::Output {
+ let v = self * Self::splat($PI);
+ (v.sin(), v.cos())
+ }
+ }
+ };
+}
+
+macro_rules! impl_def32 {
+ ($vid:ident) => {
+ impl_def!($vid, crate::f32::consts::PI);
+ };
+}
+macro_rules! impl_def64 {
+ ($vid:ident) => {
+ impl_def!($vid, crate::f64::consts::PI);
+ };
+}
+
+macro_rules! impl_unary_t {
+ ($vid:ident: $fun:ident) => {
+ impl SinCosPi for $vid {
+ type Output = (Self, Self);
+ fn sin_cos_pi(self) -> Self::Output {
+ unsafe {
+ use crate::mem::transmute;
+ transmute($fun(transmute(self)))
+ }
+ }
+ }
+ };
+ ($vid:ident[t => $vid_t:ident]: $fun:ident) => {
+ impl SinCosPi for $vid {
+ type Output = (Self, Self);
+ fn sin_cos_pi(self) -> Self::Output {
+ unsafe {
+ use crate::mem::{transmute, uninitialized};
+
+ union U {
+ vec: [$vid; 2],
+ twice: $vid_t,
+ }
+
+ let twice = U { vec: [self, uninitialized()] }.twice;
+ let twice = transmute($fun(transmute(twice)));
+
+ union R {
+ twice: ($vid_t, $vid_t),
+ vecs: ([$vid; 2], [$vid; 2]),
+ }
+ let r = R { twice }.vecs;
+ (*r.0.get_unchecked(0), *r.0.get_unchecked(1))
+ }
+ }
+ }
+ };
+ ($vid:ident[h => $vid_h:ident]: $fun:ident) => {
+ impl SinCosPi for $vid {
+ type Output = (Self, Self);
+ fn sin_cos_pi(self) -> Self::Output {
+ unsafe {
+ use crate::mem::transmute;
+
+ union U {
+ vec: $vid,
+ halves: [$vid_h; 2],
+ }
+
+ let halves = U { vec: self }.halves;
+
+ let res_0: ($vid_h, $vid_h) = transmute($fun(transmute(*halves.get_unchecked(0))));
+ let res_1: ($vid_h, $vid_h) = transmute($fun(transmute(*halves.get_unchecked(1))));
+
+ union R {
+ result: ($vid, $vid),
+ halves: ([$vid_h; 2], [$vid_h; 2]),
+ }
+ R { halves: ([res_0.0, res_1.0], [res_0.1, res_1.1]) }.result
+ }
+ }
+ }
+ };
+ ($vid:ident[q => $vid_q:ident]: $fun:ident) => {
+ impl SinCosPi for $vid {
+ type Output = (Self, Self);
+ fn sin_cos_pi(self) -> Self::Output {
+ unsafe {
+ use crate::mem::transmute;
+
+ union U {
+ vec: $vid,
+ quarters: [$vid_q; 4],
+ }
+
+ let quarters = U { vec: self }.quarters;
+
+ let res_0: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(0))));
+ let res_1: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(1))));
+ let res_2: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(2))));
+ let res_3: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(3))));
+
+ union R {
+ result: ($vid, $vid),
+ quarters: ([$vid_q; 4], [$vid_q; 4]),
+ }
+ R {
+ quarters: (
+ [res_0.0, res_1.0, res_2.0, res_3.0],
+ [res_0.1, res_1.1, res_2.1, res_3.1],
+ ),
+ }
+ .result
+ }
+ }
+ }
+ };
+}
+
+cfg_if! {
+ if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] {
+ use sleef_sys::*;
+ cfg_if! {
+ if #[cfg(target_feature = "avx2")] {
+ impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05avx2128);
+ impl_unary_t!(f32x16[h => f32x8]: Sleef_sincospif8_u05avx2);
+ impl_unary_t!(f64x8[h => f64x4]: Sleef_sincospid4_u05avx2);
+
+ impl_unary_t!(f32x4: Sleef_sincospif4_u05avx2128);
+ impl_unary_t!(f32x8: Sleef_sincospif8_u05avx2);
+ impl_unary_t!(f64x2: Sleef_sincospid2_u05avx2128);
+ impl_unary_t!(f64x4: Sleef_sincospid4_u05avx2);
+ } else if #[cfg(target_feature = "avx")] {
+ impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05sse4);
+ impl_unary_t!(f32x16[h => f32x8]: Sleef_sincospif8_u05avx);
+ impl_unary_t!(f64x8[h => f64x4]: Sleef_sincospid4_u05avx);
+
+ impl_unary_t!(f32x4: Sleef_sincospif4_u05sse4);
+ impl_unary_t!(f32x8: Sleef_sincospif8_u05avx);
+ impl_unary_t!(f64x2: Sleef_sincospid2_u05sse4);
+ impl_unary_t!(f64x4: Sleef_sincospid4_u05avx);
+ } else if #[cfg(target_feature = "sse4.2")] {
+ impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05sse4);
+ impl_unary_t!(f32x16[q => f32x4]: Sleef_sincospif4_u05sse4);
+ impl_unary_t!(f64x8[q => f64x2]: Sleef_sincospid2_u05sse4);
+
+ impl_unary_t!(f32x4: Sleef_sincospif4_u05sse4);
+ impl_unary_t!(f32x8[h => f32x4]: Sleef_sincospif4_u05sse4);
+ impl_unary_t!(f64x2: Sleef_sincospid2_u05sse4);
+ impl_unary_t!(f64x4[h => f64x2]: Sleef_sincospid2_u05sse4);
+ } else {
+ impl_def32!(f32x2);
+ impl_def32!(f32x4);
+ impl_def32!(f32x8);
+ impl_def32!(f32x16);
+
+ impl_def64!(f64x2);
+ impl_def64!(f64x4);
+ impl_def64!(f64x8);
+ }
+ }
+ } else {
+ impl_def32!(f32x2);
+ impl_def32!(f32x4);
+ impl_def32!(f32x8);
+ impl_def32!(f32x16);
+
+ impl_def64!(f64x2);
+ impl_def64!(f64x4);
+ impl_def64!(f64x8);
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/sin_pi.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/sin_pi.rs
new file mode 100644
index 0000000000..0c8f6bb120
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/sin_pi.rs
@@ -0,0 +1,87 @@
+//! Vertical floating-point `sin_pi`
+#![allow(unused)]
+
+// FIXME 64-bit 1 elem vectors sin_pi
+
+use crate::*;
+
+pub(crate) trait SinPi {
+ fn sin_pi(self) -> Self;
+}
+
+gen_unary_impl_table!(SinPi, sin_pi);
+
+macro_rules! impl_def {
+ ($vid:ident, $PI:path) => {
+ impl SinPi for $vid {
+ #[inline]
+ fn sin_pi(self) -> Self {
+ (self * Self::splat($PI)).sin()
+ }
+ }
+ };
+}
+macro_rules! impl_def32 {
+ ($vid:ident) => {
+ impl_def!($vid, crate::f32::consts::PI);
+ };
+}
+macro_rules! impl_def64 {
+ ($vid:ident) => {
+ impl_def!($vid, crate::f64::consts::PI);
+ };
+}
+
+cfg_if! {
+ if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] {
+ use sleef_sys::*;
+ cfg_if! {
+ if #[cfg(target_feature = "avx2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05avx2128);
+ impl_unary!(f32x16[h => f32x8]: Sleef_sinpif8_u05avx2);
+ impl_unary!(f64x8[h => f64x4]: Sleef_sinpid4_u05avx2);
+
+ impl_unary!(f32x4: Sleef_sinpif4_u05avx2128);
+ impl_unary!(f32x8: Sleef_sinpif8_u05avx2);
+ impl_unary!(f64x2: Sleef_sinpid2_u05avx2128);
+ impl_unary!(f64x4: Sleef_sinpid4_u05avx2);
+ } else if #[cfg(target_feature = "avx")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05sse4);
+ impl_unary!(f32x16[h => f32x8]: Sleef_sinpif8_u05avx);
+ impl_unary!(f64x8[h => f64x4]: Sleef_sinpid4_u05avx);
+
+ impl_unary!(f32x4: Sleef_sinpif4_u05sse4);
+ impl_unary!(f32x8: Sleef_sinpif8_u05avx);
+ impl_unary!(f64x2: Sleef_sinpid2_u05sse4);
+ impl_unary!(f64x4: Sleef_sinpid4_u05avx);
+ } else if #[cfg(target_feature = "sse4.2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05sse4);
+ impl_unary!(f32x16[q => f32x4]: Sleef_sinpif4_u05sse4);
+ impl_unary!(f64x8[q => f64x2]: Sleef_sinpid2_u05sse4);
+
+ impl_unary!(f32x4: Sleef_sinpif4_u05sse4);
+ impl_unary!(f32x8[h => f32x4]: Sleef_sinpif4_u05sse4);
+ impl_unary!(f64x2: Sleef_sinpid2_u05sse4);
+ impl_unary!(f64x4[h => f64x2]: Sleef_sinpid2_u05sse4);
+ } else {
+ impl_def32!(f32x2);
+ impl_def32!(f32x4);
+ impl_def32!(f32x8);
+ impl_def32!(f32x16);
+
+ impl_def64!(f64x2);
+ impl_def64!(f64x4);
+ impl_def64!(f64x8);
+ }
+ }
+ } else {
+ impl_def32!(f32x2);
+ impl_def32!(f32x4);
+ impl_def32!(f32x8);
+ impl_def32!(f32x16);
+
+ impl_def64!(f64x2);
+ impl_def64!(f64x4);
+ impl_def64!(f64x8);
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/sqrt.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/sqrt.rs
new file mode 100644
index 0000000000..67bb0a2a9c
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/sqrt.rs
@@ -0,0 +1,103 @@
+//! Vertical floating-point `sqrt`
+#![allow(unused)]
+
+// FIXME 64-bit 1 elem vectors sqrt
+
+use crate::*;
+
+pub(crate) trait Sqrt {
+ fn sqrt(self) -> Self;
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.sqrt.v2f32"]
+ fn sqrt_v2f32(x: f32x2) -> f32x2;
+ #[link_name = "llvm.sqrt.v4f32"]
+ fn sqrt_v4f32(x: f32x4) -> f32x4;
+ #[link_name = "llvm.sqrt.v8f32"]
+ fn sqrt_v8f32(x: f32x8) -> f32x8;
+ #[link_name = "llvm.sqrt.v16f32"]
+ fn sqrt_v16f32(x: f32x16) -> f32x16;
+ /* FIXME 64-bit sqrtgle elem vectors
+ #[link_name = "llvm.sqrt.v1f64"]
+ fn sqrt_v1f64(x: f64x1) -> f64x1;
+ */
+ #[link_name = "llvm.sqrt.v2f64"]
+ fn sqrt_v2f64(x: f64x2) -> f64x2;
+ #[link_name = "llvm.sqrt.v4f64"]
+ fn sqrt_v4f64(x: f64x4) -> f64x4;
+ #[link_name = "llvm.sqrt.v8f64"]
+ fn sqrt_v8f64(x: f64x8) -> f64x8;
+
+ #[link_name = "llvm.sqrt.f32"]
+ fn sqrt_f32(x: f32) -> f32;
+ #[link_name = "llvm.sqrt.f64"]
+ fn sqrt_f64(x: f64) -> f64;
+}
+
+gen_unary_impl_table!(Sqrt, sqrt);
+
+cfg_if! {
+ if #[cfg(target_arch = "s390x")] {
+ // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14
+ impl_unary!(f32x2[f32; 2]: sqrt_f32);
+ impl_unary!(f32x4[f32; 4]: sqrt_f32);
+ impl_unary!(f32x8[f32; 8]: sqrt_f32);
+ impl_unary!(f32x16[f32; 16]: sqrt_f32);
+
+ impl_unary!(f64x2[f64; 2]: sqrt_f64);
+ impl_unary!(f64x4[f64; 4]: sqrt_f64);
+ impl_unary!(f64x8[f64; 8]: sqrt_f64);
+ } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] {
+ use sleef_sys::*;
+ cfg_if! {
+ if #[cfg(target_feature = "avx2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_avx2128);
+ impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_avx2);
+ impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_avx2);
+
+ impl_unary!(f32x4: Sleef_sqrtf4_avx2128);
+ impl_unary!(f32x8: Sleef_sqrtf8_avx2);
+ impl_unary!(f64x2: Sleef_sqrtd2_avx2128);
+ impl_unary!(f64x4: Sleef_sqrtd4_avx2);
+ } else if #[cfg(target_feature = "avx")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_sse4);
+ impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_avx);
+ impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_avx);
+
+ impl_unary!(f32x4: Sleef_sqrtf4_sse4);
+ impl_unary!(f32x8: Sleef_sqrtf8_avx);
+ impl_unary!(f64x2: Sleef_sqrtd2_sse4);
+ impl_unary!(f64x4: Sleef_sqrtd4_avx);
+ } else if #[cfg(target_feature = "sse4.2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_sse4);
+ impl_unary!(f32x16[q => f32x4]: Sleef_sqrtf4_sse4);
+ impl_unary!(f64x8[q => f64x2]: Sleef_sqrtd2_sse4);
+
+ impl_unary!(f32x4: Sleef_sqrtf4_sse4);
+ impl_unary!(f32x8[h => f32x4]: Sleef_sqrtf4_sse4);
+ impl_unary!(f64x2: Sleef_sqrtd2_sse4);
+ impl_unary!(f64x4[h => f64x2]: Sleef_sqrtd2_sse4);
+ } else {
+ impl_unary!(f32x2[f32; 2]: sqrt_f32);
+ impl_unary!(f32x16: sqrt_v16f32);
+ impl_unary!(f64x8: sqrt_v8f64);
+
+ impl_unary!(f32x4: sqrt_v4f32);
+ impl_unary!(f32x8: sqrt_v8f32);
+ impl_unary!(f64x2: sqrt_v2f64);
+ impl_unary!(f64x4: sqrt_v4f64);
+ }
+ }
+ } else {
+ impl_unary!(f32x2[f32; 2]: sqrt_f32);
+ impl_unary!(f32x4: sqrt_v4f32);
+ impl_unary!(f32x8: sqrt_v8f32);
+ impl_unary!(f32x16: sqrt_v16f32);
+
+ impl_unary!(f64x2: sqrt_v2f64);
+ impl_unary!(f64x4: sqrt_v4f64);
+ impl_unary!(f64x8: sqrt_v8f64);
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/math/float/sqrte.rs b/third_party/rust/packed_simd_2/src/codegen/math/float/sqrte.rs
new file mode 100644
index 0000000000..58a1de1f40
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/math/float/sqrte.rs
@@ -0,0 +1,67 @@
+//! Vertical floating-point `sqrt`
+#![allow(unused)]
+
+// FIXME 64-bit 1 elem vectors sqrte
+
+use crate::llvm::simd_fsqrt;
+use crate::*;
+
+pub(crate) trait Sqrte {
+ fn sqrte(self) -> Self;
+}
+
+gen_unary_impl_table!(Sqrte, sqrte);
+
+cfg_if! {
+ if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] {
+ use sleef_sys::*;
+ cfg_if! {
+ if #[cfg(target_feature = "avx2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35avx2128);
+ impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_u35avx2);
+ impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_u35avx2);
+
+ impl_unary!(f32x4: Sleef_sqrtf4_u35avx2128);
+ impl_unary!(f32x8: Sleef_sqrtf8_u35avx2);
+ impl_unary!(f64x2: Sleef_sqrtd2_u35avx2128);
+ impl_unary!(f64x4: Sleef_sqrtd4_u35avx2);
+ } else if #[cfg(target_feature = "avx")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35sse4);
+ impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_u35avx);
+ impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_u35avx);
+
+ impl_unary!(f32x4: Sleef_sqrtf4_u35sse4);
+ impl_unary!(f32x8: Sleef_sqrtf8_u35avx);
+ impl_unary!(f64x2: Sleef_sqrtd2_u35sse4);
+ impl_unary!(f64x4: Sleef_sqrtd4_u35avx);
+ } else if #[cfg(target_feature = "sse4.2")] {
+ impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35sse4);
+ impl_unary!(f32x16[q => f32x4]: Sleef_sqrtf4_u35sse4);
+ impl_unary!(f64x8[q => f64x2]: Sleef_sqrtd2_u35sse4);
+
+ impl_unary!(f32x4: Sleef_sqrtf4_u35sse4);
+ impl_unary!(f32x8[h => f32x4]: Sleef_sqrtf4_u35sse4);
+ impl_unary!(f64x2: Sleef_sqrtd2_u35sse4);
+ impl_unary!(f64x4[h => f64x2]: Sleef_sqrtd2_u35sse4);
+ } else {
+ impl_unary!(f32x2[g]: simd_fsqrt);
+ impl_unary!(f32x16[g]: simd_fsqrt);
+ impl_unary!(f64x8[g]: simd_fsqrt);
+
+ impl_unary!(f32x4[g]: simd_fsqrt);
+ impl_unary!(f32x8[g]: simd_fsqrt);
+ impl_unary!(f64x2[g]: simd_fsqrt);
+ impl_unary!(f64x4[g]: simd_fsqrt);
+ }
+ }
+ } else {
+ impl_unary!(f32x2[g]: simd_fsqrt);
+ impl_unary!(f32x4[g]: simd_fsqrt);
+ impl_unary!(f32x8[g]: simd_fsqrt);
+ impl_unary!(f32x16[g]: simd_fsqrt);
+
+ impl_unary!(f64x2[g]: simd_fsqrt);
+ impl_unary!(f64x4[g]: simd_fsqrt);
+ impl_unary!(f64x8[g]: simd_fsqrt);
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/pointer_sized_int.rs b/third_party/rust/packed_simd_2/src/codegen/pointer_sized_int.rs
new file mode 100644
index 0000000000..55cbc297aa
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/pointer_sized_int.rs
@@ -0,0 +1,28 @@
+//! Provides `isize` and `usize`
+
+use cfg_if::cfg_if;
+
+cfg_if! {
+ if #[cfg(target_pointer_width = "8")] {
+ pub(crate) type isize_ = i8;
+ pub(crate) type usize_ = u8;
+ } else if #[cfg(target_pointer_width = "16")] {
+ pub(crate) type isize_ = i16;
+ pub(crate) type usize_ = u16;
+ } else if #[cfg(target_pointer_width = "32")] {
+ pub(crate) type isize_ = i32;
+ pub(crate) type usize_ = u32;
+
+ } else if #[cfg(target_pointer_width = "64")] {
+ pub(crate) type isize_ = i64;
+ pub(crate) type usize_ = u64;
+ } else if #[cfg(target_pointer_width = "64")] {
+ pub(crate) type isize_ = i64;
+ pub(crate) type usize_ = u64;
+ } else if #[cfg(target_pointer_width = "128")] {
+ pub(crate) type isize_ = i128;
+ pub(crate) type usize_ = u128;
+ } else {
+ compile_error!("unsupported target_pointer_width");
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/reductions.rs b/third_party/rust/packed_simd_2/src/codegen/reductions.rs
new file mode 100644
index 0000000000..302ca6d88f
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/reductions.rs
@@ -0,0 +1 @@
+pub(crate) mod mask;
diff --git a/third_party/rust/packed_simd_2/src/codegen/reductions/mask.rs b/third_party/rust/packed_simd_2/src/codegen/reductions/mask.rs
new file mode 100644
index 0000000000..a78bcc5632
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/reductions/mask.rs
@@ -0,0 +1,69 @@
+//! Code generation workaround for `all()` mask horizontal reduction.
+//!
+//! Works around [LLVM bug 36702].
+//!
+//! [LLVM bug 36702]: https://bugs.llvm.org/show_bug.cgi?id=36702
+#![allow(unused_macros)]
+
+use crate::*;
+
+pub(crate) trait All: crate::marker::Sized {
+ unsafe fn all(self) -> bool;
+}
+
+pub(crate) trait Any: crate::marker::Sized {
+ unsafe fn any(self) -> bool;
+}
+
+#[macro_use]
+mod fallback_impl;
+
+cfg_if! {
+ if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+ #[macro_use]
+ mod x86;
+ } else if #[cfg(all(target_arch = "arm", target_feature = "v7",
+ target_feature = "neon",
+ any(feature = "core_arch", libcore_neon)))] {
+ #[macro_use]
+ mod arm;
+ } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
+ #[macro_use]
+ mod aarch64;
+ } else {
+ #[macro_use]
+ mod fallback;
+ }
+}
+
+impl_mask_reductions!(m8x2);
+impl_mask_reductions!(m8x4);
+impl_mask_reductions!(m8x8);
+impl_mask_reductions!(m8x16);
+impl_mask_reductions!(m8x32);
+impl_mask_reductions!(m8x64);
+
+impl_mask_reductions!(m16x2);
+impl_mask_reductions!(m16x4);
+impl_mask_reductions!(m16x8);
+impl_mask_reductions!(m16x16);
+impl_mask_reductions!(m16x32);
+
+impl_mask_reductions!(m32x2);
+impl_mask_reductions!(m32x4);
+impl_mask_reductions!(m32x8);
+impl_mask_reductions!(m32x16);
+
+// FIXME: 64-bit single element vector
+// impl_mask_reductions!(m64x1);
+impl_mask_reductions!(m64x2);
+impl_mask_reductions!(m64x4);
+impl_mask_reductions!(m64x8);
+
+impl_mask_reductions!(m128x1);
+impl_mask_reductions!(m128x2);
+impl_mask_reductions!(m128x4);
+
+impl_mask_reductions!(msizex2);
+impl_mask_reductions!(msizex4);
+impl_mask_reductions!(msizex8);
diff --git a/third_party/rust/packed_simd_2/src/codegen/reductions/mask/aarch64.rs b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/aarch64.rs
new file mode 100644
index 0000000000..b2db52c891
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/aarch64.rs
@@ -0,0 +1,81 @@
+//! Mask reductions implementation for `aarch64` targets
+
+/// 128-bit wide vectors
+macro_rules! aarch64_128_neon_impl {
+ ($id:ident, $vmin:ident, $vmax:ident) => {
+ impl All for $id {
+ #[inline]
+ #[target_feature(enable = "neon")]
+ unsafe fn all(self) -> bool {
+ use crate::arch::aarch64::$vmin;
+ $vmin(crate::mem::transmute(self)) != 0
+ }
+ }
+ impl Any for $id {
+ #[inline]
+ #[target_feature(enable = "neon")]
+ unsafe fn any(self) -> bool {
+ use crate::arch::aarch64::$vmax;
+ $vmax(crate::mem::transmute(self)) != 0
+ }
+ }
+ };
+}
+
+/// 64-bit wide vectors
+macro_rules! aarch64_64_neon_impl {
+ ($id:ident, $vec128:ident) => {
+ impl All for $id {
+ #[inline]
+ #[target_feature(enable = "neon")]
+ unsafe fn all(self) -> bool {
+ // Duplicates the 64-bit vector into a 128-bit one and
+ // calls all on that.
+ union U {
+ halves: ($id, $id),
+ vec: $vec128,
+ }
+ U { halves: (self, self) }.vec.all()
+ }
+ }
+ impl Any for $id {
+ #[inline]
+ #[target_feature(enable = "neon")]
+ unsafe fn any(self) -> bool {
+ union U {
+ halves: ($id, $id),
+ vec: $vec128,
+ }
+ U { halves: (self, self) }.vec.any()
+ }
+ }
+ };
+}
+
+/// Mask reduction implementation for `aarch64` targets
+macro_rules! impl_mask_reductions {
+ // 64-bit wide masks
+ (m8x8) => {
+ aarch64_64_neon_impl!(m8x8, m8x16);
+ };
+ (m16x4) => {
+ aarch64_64_neon_impl!(m16x4, m16x8);
+ };
+ (m32x2) => {
+ aarch64_64_neon_impl!(m32x2, m32x4);
+ };
+ // 128-bit wide masks
+ (m8x16) => {
+ aarch64_128_neon_impl!(m8x16, vminvq_u8, vmaxvq_u8);
+ };
+ (m16x8) => {
+ aarch64_128_neon_impl!(m16x8, vminvq_u16, vmaxvq_u16);
+ };
+ (m32x4) => {
+ aarch64_128_neon_impl!(m32x4, vminvq_u32, vmaxvq_u32);
+ };
+ // Fallback to LLVM's default code-generation:
+ ($id:ident) => {
+ fallback_impl!($id);
+ };
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/reductions/mask/arm.rs b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/arm.rs
new file mode 100644
index 0000000000..41c3cbc58a
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/arm.rs
@@ -0,0 +1,56 @@
+//! Mask reductions implementation for `arm` targets
+
+/// Implementation for ARM + v7 + NEON for 64-bit or 128-bit wide vectors with
+/// more than two elements.
+macro_rules! arm_128_v7_neon_impl {
+ ($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => {
+ impl All for $id {
+ #[inline]
+ #[target_feature(enable = "v7,neon")]
+ unsafe fn all(self) -> bool {
+ use crate::arch::arm::$vpmin;
+ use crate::mem::transmute;
+ union U {
+ halves: ($half, $half),
+ vec: $id,
+ }
+ let halves = U { vec: self }.halves;
+ let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1)));
+ h.all()
+ }
+ }
+ impl Any for $id {
+ #[inline]
+ #[target_feature(enable = "v7,neon")]
+ unsafe fn any(self) -> bool {
+ use crate::arch::arm::$vpmax;
+ use crate::mem::transmute;
+ union U {
+ halves: ($half, $half),
+ vec: $id,
+ }
+ let halves = U { vec: self }.halves;
+ let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1)));
+ h.any()
+ }
+ }
+ };
+}
+
+/// Mask reduction implementation for `arm` targets
+macro_rules! impl_mask_reductions {
+ // 128-bit wide masks
+ (m8x16) => {
+ arm_128_v7_neon_impl!(m8x16, m8x8, vpmin_u8, vpmax_u8);
+ };
+ (m16x8) => {
+ arm_128_v7_neon_impl!(m16x8, m16x4, vpmin_u16, vpmax_u16);
+ };
+ (m32x4) => {
+ arm_128_v7_neon_impl!(m32x4, m32x2, vpmin_u32, vpmax_u32);
+ };
+ // Fallback to LLVM's default code-generation:
+ ($id:ident) => {
+ fallback_impl!($id);
+ };
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/reductions/mask/fallback.rs b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/fallback.rs
new file mode 100644
index 0000000000..4c377a6878
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/fallback.rs
@@ -0,0 +1,8 @@
+//! Default mask reduction implementations.
+
+/// Default mask reduction implementation
+macro_rules! impl_mask_reductions {
+ ($id:ident) => {
+ fallback_impl!($id);
+ };
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs
new file mode 100644
index 0000000000..0d246e2fda
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs
@@ -0,0 +1,237 @@
+//! Default implementation of a mask reduction for any target.
+
+macro_rules! fallback_to_other_impl {
+ ($id:ident, $other:ident) => {
+ impl All for $id {
+ #[inline]
+ unsafe fn all(self) -> bool {
+ let m: $other = crate::mem::transmute(self);
+ m.all()
+ }
+ }
+ impl Any for $id {
+ #[inline]
+ unsafe fn any(self) -> bool {
+ let m: $other = crate::mem::transmute(self);
+ m.any()
+ }
+ }
+ };
+}
+
+/// Fallback implementation.
+macro_rules! fallback_impl {
+ // 16-bit wide masks:
+ (m8x2) => {
+ impl All for m8x2 {
+ #[inline]
+ unsafe fn all(self) -> bool {
+ let i: u16 = crate::mem::transmute(self);
+ i == u16::max_value()
+ }
+ }
+ impl Any for m8x2 {
+ #[inline]
+ unsafe fn any(self) -> bool {
+ let i: u16 = crate::mem::transmute(self);
+ i != 0
+ }
+ }
+ };
+ // 32-bit wide masks
+ (m8x4) => {
+ impl All for m8x4 {
+ #[inline]
+ unsafe fn all(self) -> bool {
+ let i: u32 = crate::mem::transmute(self);
+ i == u32::max_value()
+ }
+ }
+ impl Any for m8x4 {
+ #[inline]
+ unsafe fn any(self) -> bool {
+ let i: u32 = crate::mem::transmute(self);
+ i != 0
+ }
+ }
+ };
+ (m16x2) => {
+ fallback_to_other_impl!(m16x2, m8x4);
+ };
+ // 64-bit wide masks:
+ (m8x8) => {
+ impl All for m8x8 {
+ #[inline]
+ unsafe fn all(self) -> bool {
+ let i: u64 = crate::mem::transmute(self);
+ i == u64::max_value()
+ }
+ }
+ impl Any for m8x8 {
+ #[inline]
+ unsafe fn any(self) -> bool {
+ let i: u64 = crate::mem::transmute(self);
+ i != 0
+ }
+ }
+ };
+ (m16x4) => {
+ fallback_to_other_impl!(m16x4, m8x8);
+ };
+ (m32x2) => {
+ fallback_to_other_impl!(m32x2, m16x4);
+ };
+ // FIXME: 64x1 maxk
+ // 128-bit wide masks:
+ (m8x16) => {
+ impl All for m8x16 {
+ #[inline]
+ unsafe fn all(self) -> bool {
+ let i: u128 = crate::mem::transmute(self);
+ i == u128::max_value()
+ }
+ }
+ impl Any for m8x16 {
+ #[inline]
+ unsafe fn any(self) -> bool {
+ let i: u128 = crate::mem::transmute(self);
+ i != 0
+ }
+ }
+ };
+ (m16x8) => {
+ fallback_to_other_impl!(m16x8, m8x16);
+ };
+ (m32x4) => {
+ fallback_to_other_impl!(m32x4, m16x8);
+ };
+ (m64x2) => {
+ fallback_to_other_impl!(m64x2, m32x4);
+ };
+ (m128x1) => {
+ fallback_to_other_impl!(m128x1, m64x2);
+ };
+ // 256-bit wide masks
+ (m8x32) => {
+ impl All for m8x32 {
+ #[inline]
+ unsafe fn all(self) -> bool {
+ let i: [u128; 2] = crate::mem::transmute(self);
+ let o: [u128; 2] = [u128::max_value(); 2];
+ i == o
+ }
+ }
+ impl Any for m8x32 {
+ #[inline]
+ unsafe fn any(self) -> bool {
+ let i: [u128; 2] = crate::mem::transmute(self);
+ let o: [u128; 2] = [0; 2];
+ i != o
+ }
+ }
+ };
+ (m16x16) => {
+ fallback_to_other_impl!(m16x16, m8x32);
+ };
+ (m32x8) => {
+ fallback_to_other_impl!(m32x8, m16x16);
+ };
+ (m64x4) => {
+ fallback_to_other_impl!(m64x4, m32x8);
+ };
+ (m128x2) => {
+ fallback_to_other_impl!(m128x2, m64x4);
+ };
+ // 512-bit wide masks
+ (m8x64) => {
+ impl All for m8x64 {
+ #[inline]
+ unsafe fn all(self) -> bool {
+ let i: [u128; 4] = crate::mem::transmute(self);
+ let o: [u128; 4] = [u128::max_value(); 4];
+ i == o
+ }
+ }
+ impl Any for m8x64 {
+ #[inline]
+ unsafe fn any(self) -> bool {
+ let i: [u128; 4] = crate::mem::transmute(self);
+ let o: [u128; 4] = [0; 4];
+ i != o
+ }
+ }
+ };
+ (m16x32) => {
+ fallback_to_other_impl!(m16x32, m8x64);
+ };
+ (m32x16) => {
+ fallback_to_other_impl!(m32x16, m16x32);
+ };
+ (m64x8) => {
+ fallback_to_other_impl!(m64x8, m32x16);
+ };
+ (m128x4) => {
+ fallback_to_other_impl!(m128x4, m64x8);
+ };
+ // Masks with pointer-sized elements64
+ (msizex2) => {
+ cfg_if! {
+ if #[cfg(target_pointer_width = "64")] {
+ fallback_to_other_impl!(msizex2, m64x2);
+ } else if #[cfg(target_pointer_width = "32")] {
+ fallback_to_other_impl!(msizex2, m32x2);
+ } else {
+ compile_error!("unsupported target_pointer_width");
+ }
+ }
+ };
+ (msizex4) => {
+ cfg_if! {
+ if #[cfg(target_pointer_width = "64")] {
+ fallback_to_other_impl!(msizex4, m64x4);
+ } else if #[cfg(target_pointer_width = "32")] {
+ fallback_to_other_impl!(msizex4, m32x4);
+ } else {
+ compile_error!("unsupported target_pointer_width");
+ }
+ }
+ };
+ (msizex8) => {
+ cfg_if! {
+ if #[cfg(target_pointer_width = "64")] {
+ fallback_to_other_impl!(msizex8, m64x8);
+ } else if #[cfg(target_pointer_width = "32")] {
+ fallback_to_other_impl!(msizex8, m32x8);
+ } else {
+ compile_error!("unsupported target_pointer_width");
+ }
+ }
+ };
+}
+
+macro_rules! recurse_half {
+ ($vid:ident, $vid_h:ident) => {
+ impl All for $vid {
+ #[inline]
+ unsafe fn all(self) -> bool {
+ union U {
+ halves: ($vid_h, $vid_h),
+ vec: $vid,
+ }
+ let halves = U { vec: self }.halves;
+ halves.0.all() && halves.1.all()
+ }
+ }
+ impl Any for $vid {
+ #[inline]
+ unsafe fn any(self) -> bool {
+ union U {
+ halves: ($vid_h, $vid_h),
+ vec: $vid,
+ }
+ let halves = U { vec: self }.halves;
+ halves.0.any() || halves.1.any()
+ }
+ }
+ };
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86.rs b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86.rs
new file mode 100644
index 0000000000..4bf5098065
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86.rs
@@ -0,0 +1,216 @@
+//! Mask reductions implementation for `x86` and `x86_64` targets
+
+#[cfg(target_feature = "sse")]
+#[macro_use]
+mod sse;
+
+#[cfg(target_feature = "sse2")]
+#[macro_use]
+mod sse2;
+
+#[cfg(target_feature = "avx")]
+#[macro_use]
+mod avx;
+
+#[cfg(target_feature = "avx2")]
+#[macro_use]
+mod avx2;
+
+/// x86 64-bit m8x8 implementation
+macro_rules! x86_m8x8_impl {
+ ($id:ident) => {
+ fallback_impl!($id);
+ };
+}
+
+/// x86 128-bit m8x16 implementation
+macro_rules! x86_m8x16_impl {
+ ($id:ident) => {
+ cfg_if! {
+ if #[cfg(target_feature = "sse2")] {
+ x86_m8x16_sse2_impl!($id);
+ } else {
+ fallback_impl!($id);
+ }
+ }
+ };
+}
+
+/// x86 128-bit m32x4 implementation
+macro_rules! x86_m32x4_impl {
+ ($id:ident) => {
+ cfg_if! {
+ if #[cfg(target_feature = "sse")] {
+ x86_m32x4_sse_impl!($id);
+ } else {
+ fallback_impl!($id);
+ }
+ }
+ };
+}
+
+/// x86 128-bit m64x2 implementation
+macro_rules! x86_m64x2_impl {
+ ($id:ident) => {
+ cfg_if! {
+ if #[cfg(target_feature = "sse2")] {
+ x86_m64x2_sse2_impl!($id);
+ } else if #[cfg(target_feature = "sse")] {
+ x86_m32x4_sse_impl!($id);
+ } else {
+ fallback_impl!($id);
+ }
+ }
+ };
+}
+
+/// x86 256-bit m8x32 implementation
+macro_rules! x86_m8x32_impl {
+ ($id:ident, $half_id:ident) => {
+ cfg_if! {
+ if #[cfg(target_feature = "avx2")] {
+ x86_m8x32_avx2_impl!($id);
+ } else if #[cfg(target_feature = "avx")] {
+ x86_m8x32_avx_impl!($id);
+ } else if #[cfg(target_feature = "sse2")] {
+ recurse_half!($id, $half_id);
+ } else {
+ fallback_impl!($id);
+ }
+ }
+ };
+}
+
+/// x86 256-bit m32x8 implementation
+macro_rules! x86_m32x8_impl {
+ ($id:ident, $half_id:ident) => {
+ cfg_if! {
+ if #[cfg(target_feature = "avx")] {
+ x86_m32x8_avx_impl!($id);
+ } else if #[cfg(target_feature = "sse")] {
+ recurse_half!($id, $half_id);
+ } else {
+ fallback_impl!($id);
+ }
+ }
+ };
+}
+
+/// x86 256-bit m64x4 implementation
+macro_rules! x86_m64x4_impl {
+ ($id:ident, $half_id:ident) => {
+ cfg_if! {
+ if #[cfg(target_feature = "avx")] {
+ x86_m64x4_avx_impl!($id);
+ } else if #[cfg(target_feature = "sse")] {
+ recurse_half!($id, $half_id);
+ } else {
+ fallback_impl!($id);
+ }
+ }
+ };
+}
+
+/// Fallback implementation.
+macro_rules! x86_intr_impl {
+ ($id:ident) => {
+ impl All for $id {
+ #[inline]
+ unsafe fn all(self) -> bool {
+ use crate::llvm::simd_reduce_all;
+ simd_reduce_all(self.0)
+ }
+ }
+ impl Any for $id {
+ #[inline]
+ unsafe fn any(self) -> bool {
+ use crate::llvm::simd_reduce_any;
+ simd_reduce_any(self.0)
+ }
+ }
+ };
+}
+
+/// Mask reduction implementation for `x86` and `x86_64` targets
+macro_rules! impl_mask_reductions {
+ // 64-bit wide masks
+ (m8x8) => {
+ x86_m8x8_impl!(m8x8);
+ };
+ (m16x4) => {
+ x86_m8x8_impl!(m16x4);
+ };
+ (m32x2) => {
+ x86_m8x8_impl!(m32x2);
+ };
+ // 128-bit wide masks
+ (m8x16) => {
+ x86_m8x16_impl!(m8x16);
+ };
+ (m16x8) => {
+ x86_m8x16_impl!(m16x8);
+ };
+ (m32x4) => {
+ x86_m32x4_impl!(m32x4);
+ };
+ (m64x2) => {
+ x86_m64x2_impl!(m64x2);
+ };
+ (m128x1) => {
+ x86_intr_impl!(m128x1);
+ };
+ // 256-bit wide masks:
+ (m8x32) => {
+ x86_m8x32_impl!(m8x32, m8x16);
+ };
+ (m16x16) => {
+ x86_m8x32_impl!(m16x16, m16x8);
+ };
+ (m32x8) => {
+ x86_m32x8_impl!(m32x8, m32x4);
+ };
+ (m64x4) => {
+ x86_m64x4_impl!(m64x4, m64x2);
+ };
+ (m128x2) => {
+ x86_intr_impl!(m128x2);
+ };
+ (msizex2) => {
+ cfg_if! {
+ if #[cfg(target_pointer_width = "64")] {
+ fallback_to_other_impl!(msizex2, m64x2);
+ } else if #[cfg(target_pointer_width = "32")] {
+ fallback_to_other_impl!(msizex2, m32x2);
+ } else {
+ compile_error!("unsupported target_pointer_width");
+ }
+ }
+ };
+ (msizex4) => {
+ cfg_if! {
+ if #[cfg(target_pointer_width = "64")] {
+ fallback_to_other_impl!(msizex4, m64x4);
+ } else if #[cfg(target_pointer_width = "32")] {
+ fallback_to_other_impl!(msizex4, m32x4);
+ } else {
+ compile_error!("unsupported target_pointer_width");
+ }
+ }
+ };
+ (msizex8) => {
+ cfg_if! {
+ if #[cfg(target_pointer_width = "64")] {
+ fallback_to_other_impl!(msizex8, m64x8);
+ } else if #[cfg(target_pointer_width = "32")] {
+ fallback_to_other_impl!(msizex8, m32x8);
+ } else {
+ compile_error!("unsupported target_pointer_width");
+ }
+ }
+ };
+
+ // Fallback to LLVM's default code-generation:
+ ($id:ident) => {
+ fallback_impl!($id);
+ };
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs
new file mode 100644
index 0000000000..61f352d228
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs
@@ -0,0 +1,95 @@
+//! Mask reductions implementation for `x86` and `x86_64` targets with `AVX`
+
+/// `x86`/`x86_64` 256-bit `AVX` implementation
+/// FIXME: it might be faster here to do two `_mm_movmask_epi8`
+#[cfg(target_feature = "avx")]
+macro_rules! x86_m8x32_avx_impl {
+ ($id:ident) => {
+ impl All for $id {
+ #[inline]
+ #[target_feature(enable = "avx")]
+ unsafe fn all(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm256_testc_si256;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm256_testc_si256;
+ _mm256_testc_si256(crate::mem::transmute(self), crate::mem::transmute($id::splat(true))) != 0
+ }
+ }
+ impl Any for $id {
+ #[inline]
+ #[target_feature(enable = "avx")]
+ unsafe fn any(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm256_testz_si256;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm256_testz_si256;
+ _mm256_testz_si256(crate::mem::transmute(self), crate::mem::transmute(self)) == 0
+ }
+ }
+ };
+}
+
+/// `x86`/`x86_64` 256-bit m32x8 `AVX` implementation
+macro_rules! x86_m32x8_avx_impl {
+ ($id:ident) => {
+ impl All for $id {
+ #[inline]
+ #[target_feature(enable = "sse")]
+ unsafe fn all(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm256_movemask_ps;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm256_movemask_ps;
+ // _mm256_movemask_ps(a) creates a 8bit mask containing the
+ // most significant bit of each lane of `a`. If all bits are
+ // set, then all 8 lanes of the mask are true.
+ _mm256_movemask_ps(crate::mem::transmute(self)) == 0b_1111_1111_i32
+ }
+ }
+ impl Any for $id {
+ #[inline]
+ #[target_feature(enable = "sse")]
+ unsafe fn any(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm256_movemask_ps;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm256_movemask_ps;
+
+ _mm256_movemask_ps(crate::mem::transmute(self)) != 0
+ }
+ }
+ };
+}
+
+/// `x86`/`x86_64` 256-bit m64x4 `AVX` implementation
+macro_rules! x86_m64x4_avx_impl {
+ ($id:ident) => {
+ impl All for $id {
+ #[inline]
+ #[target_feature(enable = "sse")]
+ unsafe fn all(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm256_movemask_pd;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm256_movemask_pd;
+ // _mm256_movemask_pd(a) creates a 4bit mask containing the
+ // most significant bit of each lane of `a`. If all bits are
+ // set, then all 4 lanes of the mask are true.
+ _mm256_movemask_pd(crate::mem::transmute(self)) == 0b_1111_i32
+ }
+ }
+ impl Any for $id {
+ #[inline]
+ #[target_feature(enable = "sse")]
+ unsafe fn any(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm256_movemask_pd;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm256_movemask_pd;
+
+ _mm256_movemask_pd(crate::mem::transmute(self)) != 0
+ }
+ }
+ };
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs
new file mode 100644
index 0000000000..d37d023420
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs
@@ -0,0 +1,35 @@
+//! Mask reductions implementation for `x86` and `x86_64` targets with `AVX2`.
+#![allow(unused)]
+
+/// x86/x86_64 256-bit m8x32 AVX2 implementation
+macro_rules! x86_m8x32_avx2_impl {
+ ($id:ident) => {
+ impl All for $id {
+ #[inline]
+ #[target_feature(enable = "sse2")]
+ unsafe fn all(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm256_movemask_epi8;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm256_movemask_epi8;
+ // _mm256_movemask_epi8(a) creates a 32bit mask containing the
+ // most significant bit of each byte of `a`. If all
+ // bits are set, then all 32 lanes of the mask are
+ // true.
+ _mm256_movemask_epi8(crate::mem::transmute(self)) == -1_i32
+ }
+ }
+ impl Any for $id {
+ #[inline]
+ #[target_feature(enable = "sse2")]
+ unsafe fn any(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm256_movemask_epi8;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm256_movemask_epi8;
+
+ _mm256_movemask_epi8(crate::mem::transmute(self)) != 0
+ }
+ }
+ };
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs
new file mode 100644
index 0000000000..e0c9aee92b
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs
@@ -0,0 +1,35 @@
+//! Mask reductions implementation for `x86` and `x86_64` targets with `SSE`.
+#![allow(unused)]
+
+/// `x86`/`x86_64` 128-bit `m32x4` `SSE` implementation
+macro_rules! x86_m32x4_sse_impl {
+ ($id:ident) => {
+ impl All for $id {
+ #[inline]
+ #[target_feature(enable = "sse")]
+ unsafe fn all(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm_movemask_ps;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm_movemask_ps;
+ // _mm_movemask_ps(a) creates a 4bit mask containing the
+ // most significant bit of each lane of `a`. If all
+ // bits are set, then all 4 lanes of the mask are
+ // true.
+ _mm_movemask_ps(crate::mem::transmute(self)) == 0b_1111_i32
+ }
+ }
+ impl Any for $id {
+ #[inline]
+ #[target_feature(enable = "sse")]
+ unsafe fn any(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm_movemask_ps;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm_movemask_ps;
+
+ _mm_movemask_ps(crate::mem::transmute(self)) != 0
+ }
+ }
+ };
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs
new file mode 100644
index 0000000000..bbb52fa47e
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs
@@ -0,0 +1,68 @@
+//! Mask reductions implementation for `x86` and `x86_64` targets with `SSE2`.
+#![allow(unused)]
+
+/// `x86`/`x86_64` 128-bit m64x2 `SSE2` implementation
+macro_rules! x86_m64x2_sse2_impl {
+ ($id:ident) => {
+ impl All for $id {
+ #[inline]
+ #[target_feature(enable = "sse")]
+ unsafe fn all(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm_movemask_pd;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm_movemask_pd;
+ // _mm_movemask_pd(a) creates a 2bit mask containing the
+ // most significant bit of each lane of `a`. If all
+ // bits are set, then all 2 lanes of the mask are
+ // true.
+ _mm_movemask_pd(crate::mem::transmute(self)) == 0b_11_i32
+ }
+ }
+ impl Any for $id {
+ #[inline]
+ #[target_feature(enable = "sse")]
+ unsafe fn any(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm_movemask_pd;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm_movemask_pd;
+
+ _mm_movemask_pd(crate::mem::transmute(self)) != 0
+ }
+ }
+ };
+}
+
+/// `x86`/`x86_64` 128-bit m8x16 `SSE2` implementation
+macro_rules! x86_m8x16_sse2_impl {
+ ($id:ident) => {
+ impl All for $id {
+ #[inline]
+ #[target_feature(enable = "sse2")]
+ unsafe fn all(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm_movemask_epi8;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm_movemask_epi8;
+ // _mm_movemask_epi8(a) creates a 16bit mask containing the
+ // most significant bit of each byte of `a`. If all
+ // bits are set, then all 16 lanes of the mask are
+ // true.
+ _mm_movemask_epi8(crate::mem::transmute(self)) == i32::from(u16::max_value())
+ }
+ }
+ impl Any for $id {
+ #[inline]
+ #[target_feature(enable = "sse2")]
+ unsafe fn any(self) -> bool {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm_movemask_epi8;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm_movemask_epi8;
+
+ _mm_movemask_epi8(crate::mem::transmute(self)) != 0
+ }
+ }
+ };
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/shuffle.rs b/third_party/rust/packed_simd_2/src/codegen/shuffle.rs
new file mode 100644
index 0000000000..d3acd48f5b
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/shuffle.rs
@@ -0,0 +1,150 @@
+//! Implementations of the `ShuffleResult` trait for the different numbers of
+//! lanes and vector element types.
+
+use crate::masks::*;
+use crate::sealed::{Seal, Shuffle};
+
+macro_rules! impl_shuffle {
+ ($array:ty, $base:ty, $out:ty) => {
+ impl Seal<$array> for $base {}
+ impl Shuffle<$array> for $base {
+ type Output = $out;
+ }
+ };
+}
+
+impl_shuffle! { [u32; 2], i8, crate::codegen::i8x2 }
+impl_shuffle! { [u32; 4], i8, crate::codegen::i8x4 }
+impl_shuffle! { [u32; 8], i8, crate::codegen::i8x8 }
+impl_shuffle! { [u32; 16], i8, crate::codegen::i8x16 }
+impl_shuffle! { [u32; 32], i8, crate::codegen::i8x32 }
+impl_shuffle! { [u32; 64], i8, crate::codegen::i8x64 }
+
+impl_shuffle! { [u32; 2], u8, crate::codegen::u8x2 }
+impl_shuffle! { [u32; 4], u8, crate::codegen::u8x4 }
+impl_shuffle! { [u32; 8], u8, crate::codegen::u8x8 }
+impl_shuffle! { [u32; 16], u8, crate::codegen::u8x16 }
+impl_shuffle! { [u32; 32], u8, crate::codegen::u8x32 }
+impl_shuffle! { [u32; 64], u8, crate::codegen::u8x64 }
+
+impl_shuffle! { [u32; 2], m8, crate::codegen::m8x2 }
+impl_shuffle! { [u32; 4], m8, crate::codegen::m8x4 }
+impl_shuffle! { [u32; 8], m8, crate::codegen::m8x8 }
+impl_shuffle! { [u32; 16], m8, crate::codegen::m8x16 }
+impl_shuffle! { [u32; 32], m8, crate::codegen::m8x32 }
+impl_shuffle! { [u32; 64], m8, crate::codegen::m8x64 }
+
+impl_shuffle! { [u32; 2], i16, crate::codegen::i16x2 }
+impl_shuffle! { [u32; 4], i16, crate::codegen::i16x4 }
+impl_shuffle! { [u32; 8], i16, crate::codegen::i16x8 }
+impl_shuffle! { [u32; 16], i16, crate::codegen::i16x16 }
+impl_shuffle! { [u32; 32], i16, crate::codegen::i16x32 }
+
+impl_shuffle! { [u32; 2], u16, crate::codegen::u16x2 }
+impl_shuffle! { [u32; 4], u16, crate::codegen::u16x4 }
+impl_shuffle! { [u32; 8], u16, crate::codegen::u16x8 }
+impl_shuffle! { [u32; 16], u16, crate::codegen::u16x16 }
+impl_shuffle! { [u32; 32], u16, crate::codegen::u16x32 }
+
+impl_shuffle! { [u32; 2], m16, crate::codegen::m16x2 }
+impl_shuffle! { [u32; 4], m16, crate::codegen::m16x4 }
+impl_shuffle! { [u32; 8], m16, crate::codegen::m16x8 }
+impl_shuffle! { [u32; 16], m16, crate::codegen::m16x16 }
+
+impl_shuffle! { [u32; 2], i32, crate::codegen::i32x2 }
+impl_shuffle! { [u32; 4], i32, crate::codegen::i32x4 }
+impl_shuffle! { [u32; 8], i32, crate::codegen::i32x8 }
+impl_shuffle! { [u32; 16], i32, crate::codegen::i32x16 }
+
+impl_shuffle! { [u32; 2], u32, crate::codegen::u32x2 }
+impl_shuffle! { [u32; 4], u32, crate::codegen::u32x4 }
+impl_shuffle! { [u32; 8], u32, crate::codegen::u32x8 }
+impl_shuffle! { [u32; 16], u32, crate::codegen::u32x16 }
+
+impl_shuffle! { [u32; 2], f32, crate::codegen::f32x2 }
+impl_shuffle! { [u32; 4], f32, crate::codegen::f32x4 }
+impl_shuffle! { [u32; 8], f32, crate::codegen::f32x8 }
+impl_shuffle! { [u32; 16], f32, crate::codegen::f32x16 }
+
+impl_shuffle! { [u32; 2], m32, crate::codegen::m32x2 }
+impl_shuffle! { [u32; 4], m32, crate::codegen::m32x4 }
+impl_shuffle! { [u32; 8], m32, crate::codegen::m32x8 }
+impl_shuffle! { [u32; 16], m32, crate::codegen::m32x16 }
+
+/* FIXME: 64-bit single element vector
+impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 }
+*/
+impl_shuffle! { [u32; 2], i64, crate::codegen::i64x2 }
+impl_shuffle! { [u32; 4], i64, crate::codegen::i64x4 }
+impl_shuffle! { [u32; 8], i64, crate::codegen::i64x8 }
+
+/* FIXME: 64-bit single element vector
+impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 }
+*/
+impl_shuffle! { [u32; 2], u64, crate::codegen::u64x2 }
+impl_shuffle! { [u32; 4], u64, crate::codegen::u64x4 }
+impl_shuffle! { [u32; 8], u64, crate::codegen::u64x8 }
+
+/* FIXME: 64-bit single element vector
+impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 }
+*/
+impl_shuffle! { [u32; 2], f64, crate::codegen::f64x2 }
+impl_shuffle! { [u32; 4], f64, crate::codegen::f64x4 }
+impl_shuffle! { [u32; 8], f64, crate::codegen::f64x8 }
+
+/* FIXME: 64-bit single element vector
+impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 }
+*/
+impl_shuffle! { [u32; 2], m64, crate::codegen::m64x2 }
+impl_shuffle! { [u32; 4], m64, crate::codegen::m64x4 }
+impl_shuffle! { [u32; 8], m64, crate::codegen::m64x8 }
+
+impl_shuffle! { [u32; 2], isize, crate::codegen::isizex2 }
+impl_shuffle! { [u32; 4], isize, crate::codegen::isizex4 }
+impl_shuffle! { [u32; 8], isize, crate::codegen::isizex8 }
+
+impl_shuffle! { [u32; 2], usize, crate::codegen::usizex2 }
+impl_shuffle! { [u32; 4], usize, crate::codegen::usizex4 }
+impl_shuffle! { [u32; 8], usize, crate::codegen::usizex8 }
+
+impl_shuffle! { [u32; 2], msize, crate::codegen::msizex2 }
+impl_shuffle! { [u32; 4], msize, crate::codegen::msizex4 }
+impl_shuffle! { [u32; 8], msize, crate::codegen::msizex8 }
+
+impl<T> Seal<[u32; 2]> for *const T {}
+impl<T> Shuffle<[u32; 2]> for *const T {
+ type Output = crate::codegen::cptrx2<T>;
+}
+impl<T> Seal<[u32; 4]> for *const T {}
+impl<T> Shuffle<[u32; 4]> for *const T {
+ type Output = crate::codegen::cptrx4<T>;
+}
+impl<T> Seal<[u32; 8]> for *const T {}
+impl<T> Shuffle<[u32; 8]> for *const T {
+ type Output = crate::codegen::cptrx8<T>;
+}
+
+impl<T> Seal<[u32; 2]> for *mut T {}
+impl<T> Shuffle<[u32; 2]> for *mut T {
+ type Output = crate::codegen::mptrx2<T>;
+}
+impl<T> Seal<[u32; 4]> for *mut T {}
+impl<T> Shuffle<[u32; 4]> for *mut T {
+ type Output = crate::codegen::mptrx4<T>;
+}
+impl<T> Seal<[u32; 8]> for *mut T {}
+impl<T> Shuffle<[u32; 8]> for *mut T {
+ type Output = crate::codegen::mptrx8<T>;
+}
+
+impl_shuffle! { [u32; 1], i128, crate::codegen::i128x1 }
+impl_shuffle! { [u32; 2], i128, crate::codegen::i128x2 }
+impl_shuffle! { [u32; 4], i128, crate::codegen::i128x4 }
+
+impl_shuffle! { [u32; 1], u128, crate::codegen::u128x1 }
+impl_shuffle! { [u32; 2], u128, crate::codegen::u128x2 }
+impl_shuffle! { [u32; 4], u128, crate::codegen::u128x4 }
+
+impl_shuffle! { [u32; 1], m128, crate::codegen::m128x1 }
+impl_shuffle! { [u32; 2], m128, crate::codegen::m128x2 }
+impl_shuffle! { [u32; 4], m128, crate::codegen::m128x4 }
diff --git a/third_party/rust/packed_simd_2/src/codegen/shuffle1_dyn.rs b/third_party/rust/packed_simd_2/src/codegen/shuffle1_dyn.rs
new file mode 100644
index 0000000000..19d457a45b
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/shuffle1_dyn.rs
@@ -0,0 +1,408 @@
+//! Shuffle vector lanes with run-time indices.
+
+use crate::*;
+
+pub trait Shuffle1Dyn {
+ type Indices;
+ fn shuffle1_dyn(self, _: Self::Indices) -> Self;
+}
+
+// Fallback implementation
+macro_rules! impl_fallback {
+ ($id:ident) => {
+ impl Shuffle1Dyn for $id {
+ type Indices = Self;
+ #[inline]
+ fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
+ let mut result = Self::splat(0);
+ for i in 0..$id::lanes() {
+ result = result.replace(i, self.extract(indices.extract(i) as usize));
+ }
+ result
+ }
+ }
+ };
+}
+
+macro_rules! impl_shuffle1_dyn {
+ (u8x8) => {
+ cfg_if! {
+ if #[cfg(all(
+ any(
+ all(target_arch = "aarch64", target_feature = "neon"),
+ all(target_arch = "doesnotexist", target_feature = "v7",
+ target_feature = "neon")
+ ),
+ any(feature = "core_arch", libcore_neon)
+ )
+ )] {
+ impl Shuffle1Dyn for u8x8 {
+ type Indices = Self;
+ #[inline]
+ fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
+ #[cfg(target_arch = "aarch64")]
+ use crate::arch::aarch64::vtbl1_u8;
+ #[cfg(target_arch = "doesnotexist")]
+ use crate::arch::arm::vtbl1_u8;
+
+ // This is safe because the binary is compiled with
+ // neon enabled at compile-time and can therefore only
+ // run on CPUs that have it enabled.
+ unsafe {
+ Simd(mem::transmute(
+ vtbl1_u8(mem::transmute(self.0),
+ crate::mem::transmute(indices.0))
+ ))
+ }
+ }
+ }
+ } else {
+ impl_fallback!(u8x8);
+ }
+ }
+ };
+ (u8x16) => {
+ cfg_if! {
+ if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
+ target_feature = "ssse3"))] {
+ impl Shuffle1Dyn for u8x16 {
+ type Indices = Self;
+ #[inline]
+ fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::_mm_shuffle_epi8;
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::_mm_shuffle_epi8;
+ // This is safe because the binary is compiled with
+ // ssse3 enabled at compile-time and can therefore only
+ // run on CPUs that have it enabled.
+ unsafe {
+ Simd(mem::transmute(
+ _mm_shuffle_epi8(mem::transmute(self.0),
+ crate::mem::transmute(indices))
+ ))
+ }
+ }
+ }
+ } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon",
+ any(feature = "core_arch", libcore_neon)))] {
+ impl Shuffle1Dyn for u8x16 {
+ type Indices = Self;
+ #[inline]
+ fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
+ use crate::arch::aarch64::vqtbl1q_u8;
+
+ // This is safe because the binary is compiled with
+ // neon enabled at compile-time and can therefore only
+ // run on CPUs that have it enabled.
+ unsafe {
+ Simd(mem::transmute(
+ vqtbl1q_u8(mem::transmute(self.0),
+ crate::mem::transmute(indices.0))
+ ))
+ }
+ }
+ }
+ } else if #[cfg(all(target_arch = "doesnotexist", target_feature = "v7",
+ target_feature = "neon",
+ any(feature = "core_arch", libcore_neon)))] {
+ impl Shuffle1Dyn for u8x16 {
+ type Indices = Self;
+ #[inline]
+ fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
+ use crate::arch::arm::vtbl2_u8;
+
+ // This is safe because the binary is compiled with
+ // neon enabled at compile-time and can therefore only
+ // run on CPUs that have it enabled.
+ unsafe {
+ union U {
+ j: u8x16,
+ s: (u8x8, u8x8),
+ }
+
+ let (i0, i1) = U { j: y }.s;
+
+ let r0 = vtbl2_u8(
+ mem::transmute(x),
+ crate::mem::transmute(i0)
+ );
+ let r1 = vtbl2_u8(
+ mem::transmute(x),
+ crate::mem::transmute(i1)
+ );
+
+ let r = U { s: (r0, r1) }.j;
+
+ Simd(mem::transmute(r))
+ }
+ }
+ }
+ } else {
+ impl_fallback!(u8x16);
+ }
+ }
+ };
+ (u16x8) => {
+ impl Shuffle1Dyn for u16x8 {
+ type Indices = Self;
+ #[inline]
+ fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
+ let indices: u8x8 = (indices * 2).cast();
+ let indices: u8x16 = shuffle!(indices, [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7]);
+ let v = u8x16::new(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+ let indices = indices + v;
+ unsafe {
+ let s: u8x16 = crate::mem::transmute(self);
+ crate::mem::transmute(s.shuffle1_dyn(indices))
+ }
+ }
+ }
+ };
+ (u32x4) => {
+ cfg_if! {
+ if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
+ target_feature = "avx"))] {
+ impl Shuffle1Dyn for u32x4 {
+ type Indices = Self;
+ #[inline]
+ fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::{_mm_permutevar_ps};
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::{_mm_permutevar_ps};
+
+ unsafe {
+ crate::mem::transmute(
+ _mm_permutevar_ps(
+ crate::mem::transmute(self.0),
+ crate::mem::transmute(indices.0)
+ )
+ )
+ }
+ }
+ }
+ } else {
+ impl Shuffle1Dyn for u32x4 {
+ type Indices = Self;
+ #[inline]
+ fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
+ let indices: u8x4 = (indices * 4).cast();
+ let indices: u8x16 = shuffle!(
+ indices,
+ [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]
+ );
+ let v = u8x16::new(
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+ );
+ let indices = indices + v;
+ unsafe {
+ let s: u8x16 =crate::mem::transmute(self);
+ crate::mem::transmute(s.shuffle1_dyn(indices))
+ }
+ }
+ }
+ }
+ }
+ };
+ (u64x2) => {
+ cfg_if! {
+ if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
+ target_feature = "avx"))] {
+ impl Shuffle1Dyn for u64x2 {
+ type Indices = Self;
+ #[inline]
+ fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
+ #[cfg(target_arch = "x86")]
+ use crate::arch::x86::{_mm_permutevar_pd};
+ #[cfg(target_arch = "x86_64")]
+ use crate::arch::x86_64::{_mm_permutevar_pd};
+ // _mm_permutevar_pd uses the _second_ bit of each
+ // element to perform the selection, that is: 0b00 => 0,
+ // 0b10 => 1:
+ let indices = indices << 1;
+ unsafe {
+ crate::mem::transmute(
+ _mm_permutevar_pd(
+ crate::mem::transmute(self),
+ crate::mem::transmute(indices)
+ )
+ )
+ }
+ }
+ }
+ } else {
+ impl Shuffle1Dyn for u64x2 {
+ type Indices = Self;
+ #[inline]
+ fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
+ let indices: u8x2 = (indices * 8).cast();
+ let indices: u8x16 = shuffle!(
+ indices,
+ [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ );
+ let v = u8x16::new(
+ 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+ );
+ let indices = indices + v;
+ unsafe {
+ let s: u8x16 =crate::mem::transmute(self);
+ crate::mem::transmute(s.shuffle1_dyn(indices))
+ }
+ }
+ }
+ }
+ }
+ };
+ (u128x1) => {
+ impl Shuffle1Dyn for u128x1 {
+ type Indices = Self;
+ #[inline]
+ fn shuffle1_dyn(self, _indices: Self::Indices) -> Self {
+ self
+ }
+ }
+ };
+ ($id:ident) => {
+ impl_fallback!($id);
+ };
+}
+
+impl_shuffle1_dyn!(u8x2);
+impl_shuffle1_dyn!(u8x4);
+impl_shuffle1_dyn!(u8x8);
+impl_shuffle1_dyn!(u8x16);
+impl_shuffle1_dyn!(u8x32);
+impl_shuffle1_dyn!(u8x64);
+
+impl_shuffle1_dyn!(u16x2);
+impl_shuffle1_dyn!(u16x4);
+impl_shuffle1_dyn!(u16x8);
+impl_shuffle1_dyn!(u16x16);
+impl_shuffle1_dyn!(u16x32);
+
+impl_shuffle1_dyn!(u32x2);
+impl_shuffle1_dyn!(u32x4);
+impl_shuffle1_dyn!(u32x8);
+impl_shuffle1_dyn!(u32x16);
+
+impl_shuffle1_dyn!(u64x2);
+impl_shuffle1_dyn!(u64x4);
+impl_shuffle1_dyn!(u64x8);
+
+impl_shuffle1_dyn!(usizex2);
+impl_shuffle1_dyn!(usizex4);
+impl_shuffle1_dyn!(usizex8);
+
+impl_shuffle1_dyn!(u128x1);
+impl_shuffle1_dyn!(u128x2);
+impl_shuffle1_dyn!(u128x4);
+
+// Implementation for non-unsigned vector types
+macro_rules! impl_shuffle1_dyn_non_u {
+ ($id:ident, $uid:ident) => {
+ impl Shuffle1Dyn for $id {
+ type Indices = $uid;
+ #[inline]
+ fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
+ unsafe {
+ let u: $uid = crate::mem::transmute(self);
+ crate::mem::transmute(u.shuffle1_dyn(indices))
+ }
+ }
+ }
+ };
+}
+
+impl_shuffle1_dyn_non_u!(i8x2, u8x2);
+impl_shuffle1_dyn_non_u!(i8x4, u8x4);
+impl_shuffle1_dyn_non_u!(i8x8, u8x8);
+impl_shuffle1_dyn_non_u!(i8x16, u8x16);
+impl_shuffle1_dyn_non_u!(i8x32, u8x32);
+impl_shuffle1_dyn_non_u!(i8x64, u8x64);
+
+impl_shuffle1_dyn_non_u!(i16x2, u16x2);
+impl_shuffle1_dyn_non_u!(i16x4, u16x4);
+impl_shuffle1_dyn_non_u!(i16x8, u16x8);
+impl_shuffle1_dyn_non_u!(i16x16, u16x16);
+impl_shuffle1_dyn_non_u!(i16x32, u16x32);
+
+impl_shuffle1_dyn_non_u!(i32x2, u32x2);
+impl_shuffle1_dyn_non_u!(i32x4, u32x4);
+impl_shuffle1_dyn_non_u!(i32x8, u32x8);
+impl_shuffle1_dyn_non_u!(i32x16, u32x16);
+
+impl_shuffle1_dyn_non_u!(i64x2, u64x2);
+impl_shuffle1_dyn_non_u!(i64x4, u64x4);
+impl_shuffle1_dyn_non_u!(i64x8, u64x8);
+
+impl_shuffle1_dyn_non_u!(isizex2, usizex2);
+impl_shuffle1_dyn_non_u!(isizex4, usizex4);
+impl_shuffle1_dyn_non_u!(isizex8, usizex8);
+
+impl_shuffle1_dyn_non_u!(i128x1, u128x1);
+impl_shuffle1_dyn_non_u!(i128x2, u128x2);
+impl_shuffle1_dyn_non_u!(i128x4, u128x4);
+
+impl_shuffle1_dyn_non_u!(m8x2, u8x2);
+impl_shuffle1_dyn_non_u!(m8x4, u8x4);
+impl_shuffle1_dyn_non_u!(m8x8, u8x8);
+impl_shuffle1_dyn_non_u!(m8x16, u8x16);
+impl_shuffle1_dyn_non_u!(m8x32, u8x32);
+impl_shuffle1_dyn_non_u!(m8x64, u8x64);
+
+impl_shuffle1_dyn_non_u!(m16x2, u16x2);
+impl_shuffle1_dyn_non_u!(m16x4, u16x4);
+impl_shuffle1_dyn_non_u!(m16x8, u16x8);
+impl_shuffle1_dyn_non_u!(m16x16, u16x16);
+impl_shuffle1_dyn_non_u!(m16x32, u16x32);
+
+impl_shuffle1_dyn_non_u!(m32x2, u32x2);
+impl_shuffle1_dyn_non_u!(m32x4, u32x4);
+impl_shuffle1_dyn_non_u!(m32x8, u32x8);
+impl_shuffle1_dyn_non_u!(m32x16, u32x16);
+
+impl_shuffle1_dyn_non_u!(m64x2, u64x2);
+impl_shuffle1_dyn_non_u!(m64x4, u64x4);
+impl_shuffle1_dyn_non_u!(m64x8, u64x8);
+
+impl_shuffle1_dyn_non_u!(msizex2, usizex2);
+impl_shuffle1_dyn_non_u!(msizex4, usizex4);
+impl_shuffle1_dyn_non_u!(msizex8, usizex8);
+
+impl_shuffle1_dyn_non_u!(m128x1, u128x1);
+impl_shuffle1_dyn_non_u!(m128x2, u128x2);
+impl_shuffle1_dyn_non_u!(m128x4, u128x4);
+
+impl_shuffle1_dyn_non_u!(f32x2, u32x2);
+impl_shuffle1_dyn_non_u!(f32x4, u32x4);
+impl_shuffle1_dyn_non_u!(f32x8, u32x8);
+impl_shuffle1_dyn_non_u!(f32x16, u32x16);
+
+impl_shuffle1_dyn_non_u!(f64x2, u64x2);
+impl_shuffle1_dyn_non_u!(f64x4, u64x4);
+impl_shuffle1_dyn_non_u!(f64x8, u64x8);
+
+// Implementation for non-unsigned vector types
+macro_rules! impl_shuffle1_dyn_ptr {
+ ($id:ident, $uid:ident) => {
+ impl<T> Shuffle1Dyn for $id<T> {
+ type Indices = $uid;
+ #[inline]
+ fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
+ unsafe {
+ let u: $uid = crate::mem::transmute(self);
+ crate::mem::transmute(u.shuffle1_dyn(indices))
+ }
+ }
+ }
+ };
+}
+
+impl_shuffle1_dyn_ptr!(cptrx2, usizex2);
+impl_shuffle1_dyn_ptr!(cptrx4, usizex4);
+impl_shuffle1_dyn_ptr!(cptrx8, usizex8);
+
+impl_shuffle1_dyn_ptr!(mptrx2, usizex2);
+impl_shuffle1_dyn_ptr!(mptrx4, usizex4);
+impl_shuffle1_dyn_ptr!(mptrx8, usizex8);
diff --git a/third_party/rust/packed_simd_2/src/codegen/swap_bytes.rs b/third_party/rust/packed_simd_2/src/codegen/swap_bytes.rs
new file mode 100644
index 0000000000..9cf34a3e04
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/swap_bytes.rs
@@ -0,0 +1,149 @@
+//! Horizontal swap bytes reductions.
+
+// FIXME: investigate using `llvm.bswap`
+// https://github.com/rust-lang-nursery/packed_simd/issues/19
+
+use crate::*;
+
+pub(crate) trait SwapBytes {
+ fn swap_bytes(self) -> Self;
+}
+
+macro_rules! impl_swap_bytes {
+ (v16: $($id:ident,)+) => {
+ $(
+ impl SwapBytes for $id {
+ #[inline]
+ fn swap_bytes(self) -> Self {
+ shuffle!(self, [1, 0])
+ }
+ }
+ )+
+ };
+ (v32: $($id:ident,)+) => {
+ $(
+ impl SwapBytes for $id {
+ #[inline]
+ #[allow(clippy::useless_transmute)]
+ fn swap_bytes(self) -> Self {
+ unsafe {
+ let bytes: u8x4 = crate::mem::transmute(self);
+ let result: u8x4 = shuffle!(bytes, [3, 2, 1, 0]);
+ crate::mem::transmute(result)
+ }
+ }
+ }
+ )+
+ };
+ (v64: $($id:ident,)+) => {
+ $(
+ impl SwapBytes for $id {
+ #[inline]
+ #[allow(clippy::useless_transmute)]
+ fn swap_bytes(self) -> Self {
+ unsafe {
+ let bytes: u8x8 = crate::mem::transmute(self);
+ let result: u8x8 = shuffle!(
+ bytes, [7, 6, 5, 4, 3, 2, 1, 0]
+ );
+ crate::mem::transmute(result)
+ }
+ }
+ }
+ )+
+ };
+ (v128: $($id:ident,)+) => {
+ $(
+ impl SwapBytes for $id {
+ #[inline]
+ #[allow(clippy::useless_transmute)]
+ fn swap_bytes(self) -> Self {
+ unsafe {
+ let bytes: u8x16 = crate::mem::transmute(self);
+ let result: u8x16 = shuffle!(bytes, [
+ 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0
+ ]);
+ crate::mem::transmute(result)
+ }
+ }
+ }
+ )+
+ };
+ (v256: $($id:ident,)+) => {
+ $(
+ impl SwapBytes for $id {
+ #[inline]
+ #[allow(clippy::useless_transmute)]
+ fn swap_bytes(self) -> Self {
+ unsafe {
+ let bytes: u8x32 = crate::mem::transmute(self);
+ let result: u8x32 = shuffle!(bytes, [
+ 31, 30, 29, 28, 27, 26, 25, 24,
+ 23, 22, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0
+ ]);
+ crate::mem::transmute(result)
+ }
+ }
+ }
+ )+
+ };
+ (v512: $($id:ident,)+) => {
+ $(
+ impl SwapBytes for $id {
+ #[inline]
+ #[allow(clippy::useless_transmute)]
+ fn swap_bytes(self) -> Self {
+ unsafe {
+ let bytes: u8x64 = crate::mem::transmute(self);
+ let result: u8x64 = shuffle!(bytes, [
+ 63, 62, 61, 60, 59, 58, 57, 56,
+ 55, 54, 53, 52, 51, 50, 49, 48,
+ 47, 46, 45, 44, 43, 42, 41, 40,
+ 39, 38, 37, 36, 35, 34, 33, 32,
+ 31, 30, 29, 28, 27, 26, 25, 24,
+ 23, 22, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0
+ ]);
+ crate::mem::transmute(result)
+ }
+ }
+ }
+ )+
+ };
+}
+
+impl_swap_bytes!(v16: u8x2, i8x2,);
+impl_swap_bytes!(v32: u8x4, i8x4, u16x2, i16x2,);
+// FIXME: 64-bit single element vector
+impl_swap_bytes!(v64: u8x8, i8x8, u16x4, i16x4, u32x2, i32x2 /* u64x1, i64x1, */,);
+
+impl_swap_bytes!(v128: u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2, u128x1, i128x1,);
+impl_swap_bytes!(v256: u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4, u128x2, i128x2,);
+
+impl_swap_bytes!(v512: u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8, u128x4, i128x4,);
+
+cfg_if! {
+ if #[cfg(target_pointer_width = "8")] {
+ impl_swap_bytes!(v16: isizex2, usizex2,);
+ impl_swap_bytes!(v32: isizex4, usizex4,);
+ impl_swap_bytes!(v64: isizex8, usizex8,);
+ } else if #[cfg(target_pointer_width = "16")] {
+ impl_swap_bytes!(v32: isizex2, usizex2,);
+ impl_swap_bytes!(v64: isizex4, usizex4,);
+ impl_swap_bytes!(v128: isizex8, usizex8,);
+ } else if #[cfg(target_pointer_width = "32")] {
+ impl_swap_bytes!(v64: isizex2, usizex2,);
+ impl_swap_bytes!(v128: isizex4, usizex4,);
+ impl_swap_bytes!(v256: isizex8, usizex8,);
+ } else if #[cfg(target_pointer_width = "64")] {
+ impl_swap_bytes!(v128: isizex2, usizex2,);
+ impl_swap_bytes!(v256: isizex4, usizex4,);
+ impl_swap_bytes!(v512: isizex8, usizex8,);
+ } else {
+ compile_error!("unsupported target_pointer_width");
+ }
+}
diff --git a/third_party/rust/packed_simd_2/src/codegen/v128.rs b/third_party/rust/packed_simd_2/src/codegen/v128.rs
new file mode 100644
index 0000000000..9506424fad
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/v128.rs
@@ -0,0 +1,46 @@
+//! Internal 128-bit wide vector types
+
+use crate::masks::*;
+
+#[rustfmt::skip]
+impl_simd_array!(
+ [i8; 16]: i8x16 |
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [u8; 16]: u8x16 |
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [m8; 16]: m8x16 |
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8
+);
+
+impl_simd_array!([i16; 8]: i16x8 | i16, i16, i16, i16, i16, i16, i16, i16);
+impl_simd_array!([u16; 8]: u16x8 | u16, u16, u16, u16, u16, u16, u16, u16);
+impl_simd_array!([m16; 8]: m16x8 | i16, i16, i16, i16, i16, i16, i16, i16);
+
+impl_simd_array!([i32; 4]: i32x4 | i32, i32, i32, i32);
+impl_simd_array!([u32; 4]: u32x4 | u32, u32, u32, u32);
+impl_simd_array!([f32; 4]: f32x4 | f32, f32, f32, f32);
+impl_simd_array!([m32; 4]: m32x4 | i32, i32, i32, i32);
+
+impl_simd_array!([i64; 2]: i64x2 | i64, i64);
+impl_simd_array!([u64; 2]: u64x2 | u64, u64);
+impl_simd_array!([f64; 2]: f64x2 | f64, f64);
+impl_simd_array!([m64; 2]: m64x2 | i64, i64);
+
+impl_simd_array!([i128; 1]: i128x1 | i128);
+impl_simd_array!([u128; 1]: u128x1 | u128);
+impl_simd_array!([m128; 1]: m128x1 | i128);
diff --git a/third_party/rust/packed_simd_2/src/codegen/v16.rs b/third_party/rust/packed_simd_2/src/codegen/v16.rs
new file mode 100644
index 0000000000..4d55a6d899
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/v16.rs
@@ -0,0 +1,7 @@
+//! Internal 16-bit wide vector types
+
+use crate::masks::*;
+
+impl_simd_array!([i8; 2]: i8x2 | i8, i8);
+impl_simd_array!([u8; 2]: u8x2 | u8, u8);
+impl_simd_array!([m8; 2]: m8x2 | i8, i8);
diff --git a/third_party/rust/packed_simd_2/src/codegen/v256.rs b/third_party/rust/packed_simd_2/src/codegen/v256.rs
new file mode 100644
index 0000000000..5ca4759f0c
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/v256.rs
@@ -0,0 +1,78 @@
+//! Internal 256-bit wide vector types
+
+use crate::masks::*;
+
+#[rustfmt::skip]
+impl_simd_array!(
+ [i8; 32]: i8x32 |
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [u8; 32]: u8x32 |
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [m8; 32]: m8x32 |
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [i16; 16]: i16x16 |
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [u16; 16]: u16x16 |
+ u16, u16, u16, u16,
+ u16, u16, u16, u16,
+ u16, u16, u16, u16,
+ u16, u16, u16, u16
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [m16; 16]: m16x16 |
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16
+);
+
+impl_simd_array!([i32; 8]: i32x8 | i32, i32, i32, i32, i32, i32, i32, i32);
+impl_simd_array!([u32; 8]: u32x8 | u32, u32, u32, u32, u32, u32, u32, u32);
+impl_simd_array!([f32; 8]: f32x8 | f32, f32, f32, f32, f32, f32, f32, f32);
+impl_simd_array!([m32; 8]: m32x8 | i32, i32, i32, i32, i32, i32, i32, i32);
+
+impl_simd_array!([i64; 4]: i64x4 | i64, i64, i64, i64);
+impl_simd_array!([u64; 4]: u64x4 | u64, u64, u64, u64);
+impl_simd_array!([f64; 4]: f64x4 | f64, f64, f64, f64);
+impl_simd_array!([m64; 4]: m64x4 | i64, i64, i64, i64);
+
+impl_simd_array!([i128; 2]: i128x2 | i128, i128);
+impl_simd_array!([u128; 2]: u128x2 | u128, u128);
+impl_simd_array!([m128; 2]: m128x2 | i128, i128);
diff --git a/third_party/rust/packed_simd_2/src/codegen/v32.rs b/third_party/rust/packed_simd_2/src/codegen/v32.rs
new file mode 100644
index 0000000000..ae1dabd00c
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/v32.rs
@@ -0,0 +1,11 @@
+//! Internal 32-bit wide vector types
+
+use crate::masks::*;
+
+impl_simd_array!([i8; 4]: i8x4 | i8, i8, i8, i8);
+impl_simd_array!([u8; 4]: u8x4 | u8, u8, u8, u8);
+impl_simd_array!([m8; 4]: m8x4 | i8, i8, i8, i8);
+
+impl_simd_array!([i16; 2]: i16x2 | i16, i16);
+impl_simd_array!([u16; 2]: u16x2 | u16, u16);
+impl_simd_array!([m16; 2]: m16x2 | i16, i16);
diff --git a/third_party/rust/packed_simd_2/src/codegen/v512.rs b/third_party/rust/packed_simd_2/src/codegen/v512.rs
new file mode 100644
index 0000000000..bf95110340
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/v512.rs
@@ -0,0 +1,145 @@
+//! Internal 512-bit wide vector types
+
+use crate::masks::*;
+
+#[rustfmt::skip]
+impl_simd_array!(
+ [i8; 64]: i8x64 |
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [u8; 64]: u8x64 |
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8,
+ u8, u8, u8, u8
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [m8; 64]: m8x64 |
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8,
+ i8, i8, i8, i8
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [i16; 32]: i16x32 |
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [u16; 32]: u16x32 |
+ u16, u16, u16, u16,
+ u16, u16, u16, u16,
+ u16, u16, u16, u16,
+ u16, u16, u16, u16,
+ u16, u16, u16, u16,
+ u16, u16, u16, u16,
+ u16, u16, u16, u16,
+ u16, u16, u16, u16
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [m16; 32]: m16x32 |
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16,
+ i16, i16, i16, i16
+);
+
+#[rustfmt::skip]
+impl_simd_array!(
+ [i32; 16]: i32x16 |
+ i32, i32, i32, i32,
+ i32, i32, i32, i32,
+ i32, i32, i32, i32,
+ i32, i32, i32, i32
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [u32; 16]: u32x16 |
+ u32, u32, u32, u32,
+ u32, u32, u32, u32,
+ u32, u32, u32, u32,
+ u32, u32, u32, u32
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [f32; 16]: f32x16 |
+ f32, f32, f32, f32,
+ f32, f32, f32, f32,
+ f32, f32, f32, f32,
+ f32, f32, f32, f32
+);
+#[rustfmt::skip]
+impl_simd_array!(
+ [m32; 16]: m32x16 |
+ i32, i32, i32, i32,
+ i32, i32, i32, i32,
+ i32, i32, i32, i32,
+ i32, i32, i32, i32
+);
+
+impl_simd_array!([i64; 8]: i64x8 | i64, i64, i64, i64, i64, i64, i64, i64);
+impl_simd_array!([u64; 8]: u64x8 | u64, u64, u64, u64, u64, u64, u64, u64);
+impl_simd_array!([f64; 8]: f64x8 | f64, f64, f64, f64, f64, f64, f64, f64);
+impl_simd_array!([m64; 8]: m64x8 | i64, i64, i64, i64, i64, i64, i64, i64);
+
+impl_simd_array!([i128; 4]: i128x4 | i128, i128, i128, i128);
+impl_simd_array!([u128; 4]: u128x4 | u128, u128, u128, u128);
+impl_simd_array!([m128; 4]: m128x4 | i128, i128, i128, i128);
diff --git a/third_party/rust/packed_simd_2/src/codegen/v64.rs b/third_party/rust/packed_simd_2/src/codegen/v64.rs
new file mode 100644
index 0000000000..3cfb67c1a0
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/v64.rs
@@ -0,0 +1,21 @@
+//! Internal 64-bit wide vector types
+
+use crate::masks::*;
+
+impl_simd_array!([i8; 8]: i8x8 | i8, i8, i8, i8, i8, i8, i8, i8);
+impl_simd_array!([u8; 8]: u8x8 | u8, u8, u8, u8, u8, u8, u8, u8);
+impl_simd_array!([m8; 8]: m8x8 | i8, i8, i8, i8, i8, i8, i8, i8);
+
+impl_simd_array!([i16; 4]: i16x4 | i16, i16, i16, i16);
+impl_simd_array!([u16; 4]: u16x4 | u16, u16, u16, u16);
+impl_simd_array!([m16; 4]: m16x4 | i16, i16, i16, i16);
+
+impl_simd_array!([i32; 2]: i32x2 | i32, i32);
+impl_simd_array!([u32; 2]: u32x2 | u32, u32);
+impl_simd_array!([f32; 2]: f32x2 | f32, f32);
+impl_simd_array!([m32; 2]: m32x2 | i32, i32);
+
+impl_simd_array!([i64; 1]: i64x1 | i64);
+impl_simd_array!([u64; 1]: u64x1 | u64);
+impl_simd_array!([f64; 1]: f64x1 | f64);
+impl_simd_array!([m64; 1]: m64x1 | i64);
diff --git a/third_party/rust/packed_simd_2/src/codegen/vPtr.rs b/third_party/rust/packed_simd_2/src/codegen/vPtr.rs
new file mode 100644
index 0000000000..abd3aa8779
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/vPtr.rs
@@ -0,0 +1,35 @@
+//! Pointer vector types
+
+macro_rules! impl_simd_ptr {
+ ([$ptr_ty:ty; $elem_count:expr]: $tuple_id:ident | $ty:ident
+ | $($tys:ty),*) => {
+ #[derive(Copy, Clone)]
+ #[repr(simd)]
+ pub struct $tuple_id<$ty>($(pub(crate) $tys),*);
+ //^^^^^^^ leaked through SimdArray
+
+ impl<$ty> crate::sealed::Seal for [$ptr_ty; $elem_count] {}
+ impl<$ty> crate::sealed::SimdArray for [$ptr_ty; $elem_count] {
+ type Tuple = $tuple_id<$ptr_ty>;
+ type T = $ptr_ty;
+ const N: usize = $elem_count;
+ type NT = [u32; $elem_count];
+ }
+
+ impl<$ty> crate::sealed::Seal for $tuple_id<$ptr_ty> {}
+ impl<$ty> crate::sealed::Simd for $tuple_id<$ptr_ty> {
+ type Element = $ptr_ty;
+ const LANES: usize = $elem_count;
+ type LanesType = [u32; $elem_count];
+ }
+
+ }
+}
+
+impl_simd_ptr!([*const T; 2]: cptrx2 | T | T, T);
+impl_simd_ptr!([*const T; 4]: cptrx4 | T | T, T, T, T);
+impl_simd_ptr!([*const T; 8]: cptrx8 | T | T, T, T, T, T, T, T, T);
+
+impl_simd_ptr!([*mut T; 2]: mptrx2 | T | T, T);
+impl_simd_ptr!([*mut T; 4]: mptrx4 | T | T, T, T, T);
+impl_simd_ptr!([*mut T; 8]: mptrx8 | T | T, T, T, T, T, T, T, T);
diff --git a/third_party/rust/packed_simd_2/src/codegen/vSize.rs b/third_party/rust/packed_simd_2/src/codegen/vSize.rs
new file mode 100644
index 0000000000..d5db03991d
--- /dev/null
+++ b/third_party/rust/packed_simd_2/src/codegen/vSize.rs
@@ -0,0 +1,16 @@
+//! Vector types with pointer-sized elements
+
+use crate::codegen::pointer_sized_int::{isize_, usize_};
+use crate::masks::*;
+
+impl_simd_array!([isize; 2]: isizex2 | isize_, isize_);
+impl_simd_array!([usize; 2]: usizex2 | usize_, usize_);
+impl_simd_array!([msize; 2]: msizex2 | isize_, isize_);
+
+impl_simd_array!([isize; 4]: isizex4 | isize_, isize_, isize_, isize_);
+impl_simd_array!([usize; 4]: usizex4 | usize_, usize_, usize_, usize_);
+impl_simd_array!([msize; 4]: msizex4 | isize_, isize_, isize_, isize_);
+
+impl_simd_array!([isize; 8]: isizex8 | isize_, isize_, isize_, isize_, isize_, isize_, isize_, isize_);
+impl_simd_array!([usize; 8]: usizex8 | usize_, usize_, usize_, usize_, usize_, usize_, usize_, usize_);
+impl_simd_array!([msize; 8]: msizex8 | isize_, isize_, isize_, isize_, isize_, isize_, isize_, isize_);