From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/packed_simd_2/src/codegen/bit_manip.rs | 354 +++++++++++++ vendor/packed_simd_2/src/codegen/llvm.rs | 107 ++++ vendor/packed_simd_2/src/codegen/math.rs | 3 + vendor/packed_simd_2/src/codegen/math/float.rs | 19 + vendor/packed_simd_2/src/codegen/math/float/abs.rs | 103 ++++ vendor/packed_simd_2/src/codegen/math/float/cos.rs | 103 ++++ .../packed_simd_2/src/codegen/math/float/cos_pi.rs | 87 ++++ vendor/packed_simd_2/src/codegen/math/float/exp.rs | 112 +++++ vendor/packed_simd_2/src/codegen/math/float/ln.rs | 112 +++++ .../packed_simd_2/src/codegen/math/float/macros.rs | 559 +++++++++++++++++++++ .../src/codegen/math/float/mul_add.rs | 109 ++++ .../src/codegen/math/float/mul_adde.rs | 66 +++ .../packed_simd_2/src/codegen/math/float/powf.rs | 112 +++++ vendor/packed_simd_2/src/codegen/math/float/sin.rs | 103 ++++ .../src/codegen/math/float/sin_cos_pi.rs | 195 +++++++ .../packed_simd_2/src/codegen/math/float/sin_pi.rs | 87 ++++ .../packed_simd_2/src/codegen/math/float/sqrt.rs | 103 ++++ .../packed_simd_2/src/codegen/math/float/sqrte.rs | 67 +++ .../packed_simd_2/src/codegen/math/float/tanh.rs | 117 +++++ .../packed_simd_2/src/codegen/pointer_sized_int.rs | 28 ++ vendor/packed_simd_2/src/codegen/reductions.rs | 1 + .../packed_simd_2/src/codegen/reductions/mask.rs | 69 +++ .../src/codegen/reductions/mask/aarch64.rs | 71 +++ .../src/codegen/reductions/mask/arm.rs | 54 ++ .../src/codegen/reductions/mask/fallback.rs | 6 + .../src/codegen/reductions/mask/fallback_impl.rs | 237 +++++++++ .../src/codegen/reductions/mask/x86.rs | 188 +++++++ .../src/codegen/reductions/mask/x86/avx.rs | 101 ++++ .../src/codegen/reductions/mask/x86/avx2.rs | 35 ++ .../src/codegen/reductions/mask/x86/sse.rs | 36 ++ .../src/codegen/reductions/mask/x86/sse2.rs | 70 +++ vendor/packed_simd_2/src/codegen/shuffle.rs | 150 ++++++ vendor/packed_simd_2/src/codegen/shuffle1_dyn.rs | 411 +++++++++++++++ vendor/packed_simd_2/src/codegen/swap_bytes.rs | 189 +++++++ vendor/packed_simd_2/src/codegen/v128.rs | 46 ++ vendor/packed_simd_2/src/codegen/v16.rs | 7 + vendor/packed_simd_2/src/codegen/v256.rs | 78 +++ vendor/packed_simd_2/src/codegen/v32.rs | 11 + vendor/packed_simd_2/src/codegen/v512.rs | 145 ++++++ vendor/packed_simd_2/src/codegen/v64.rs | 21 + vendor/packed_simd_2/src/codegen/vPtr.rs | 35 ++ vendor/packed_simd_2/src/codegen/vSize.rs | 43 ++ 42 files changed, 4550 insertions(+) create mode 100644 vendor/packed_simd_2/src/codegen/bit_manip.rs create mode 100644 vendor/packed_simd_2/src/codegen/llvm.rs create mode 100644 vendor/packed_simd_2/src/codegen/math.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/abs.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/cos.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/cos_pi.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/exp.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/ln.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/macros.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/mul_add.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/mul_adde.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/powf.rs create mode 100644 
vendor/packed_simd_2/src/codegen/math/float/sin.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/sin_cos_pi.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/sin_pi.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/sqrt.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/sqrte.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/tanh.rs create mode 100644 vendor/packed_simd_2/src/codegen/pointer_sized_int.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/aarch64.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/arm.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/fallback.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs create mode 100644 vendor/packed_simd_2/src/codegen/shuffle.rs create mode 100644 vendor/packed_simd_2/src/codegen/shuffle1_dyn.rs create mode 100644 vendor/packed_simd_2/src/codegen/swap_bytes.rs create mode 100644 vendor/packed_simd_2/src/codegen/v128.rs create mode 100644 vendor/packed_simd_2/src/codegen/v16.rs create mode 100644 vendor/packed_simd_2/src/codegen/v256.rs create mode 100644 vendor/packed_simd_2/src/codegen/v32.rs create mode 100644 vendor/packed_simd_2/src/codegen/v512.rs create mode 100644 vendor/packed_simd_2/src/codegen/v64.rs create mode 100644 vendor/packed_simd_2/src/codegen/vPtr.rs create mode 100644 vendor/packed_simd_2/src/codegen/vSize.rs (limited to 'vendor/packed_simd_2/src/codegen') diff --git a/vendor/packed_simd_2/src/codegen/bit_manip.rs b/vendor/packed_simd_2/src/codegen/bit_manip.rs new file mode 100644 index 000000000..83c7d1987 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/bit_manip.rs @@ -0,0 +1,354 @@ +//! LLVM bit manipulation intrinsics. 
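+//!
+//! A minimal, hedged sketch of the lane-wise semantics these intrinsics back
+//! (it assumes the crate's public `count_ones`/`leading_zeros`/`trailing_zeros`
+//! wrappers, which forward to the `BitManip` trait defined below; the import
+//! path depends on how the crate is renamed in the consumer's `Cargo.toml`):
+//!
+//! ```ignore
+//! use packed_simd_2::u8x4;
+//!
+//! let v = u8x4::new(0b0000_0001, 0b0000_0100, 0, 0xff);
+//! // ctpop / ctlz / cttz, one result per lane:
+//! assert_eq!(v.count_ones(), u8x4::new(1, 1, 0, 8));
+//! assert_eq!(v.leading_zeros(), u8x4::new(7, 5, 8, 0));
+//! assert_eq!(v.trailing_zeros(), u8x4::new(0, 2, 8, 0));
+//! ```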
+#[rustfmt::skip] + +use crate::*; + +#[allow(improper_ctypes, dead_code)] +extern "C" { + #[link_name = "llvm.ctlz.v2i8"] + fn ctlz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2; + #[link_name = "llvm.ctlz.v4i8"] + fn ctlz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4; + #[link_name = "llvm.ctlz.v8i8"] + fn ctlz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8; + #[link_name = "llvm.ctlz.v16i8"] + fn ctlz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16; + #[link_name = "llvm.ctlz.v32i8"] + fn ctlz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32; + #[link_name = "llvm.ctlz.v64i8"] + fn ctlz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64; + + #[link_name = "llvm.ctlz.v2i16"] + fn ctlz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2; + #[link_name = "llvm.ctlz.v4i16"] + fn ctlz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4; + #[link_name = "llvm.ctlz.v8i16"] + fn ctlz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8; + #[link_name = "llvm.ctlz.v16i16"] + fn ctlz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16; + #[link_name = "llvm.ctlz.v32i16"] + fn ctlz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32; + + #[link_name = "llvm.ctlz.v2i32"] + fn ctlz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2; + #[link_name = "llvm.ctlz.v4i32"] + fn ctlz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4; + #[link_name = "llvm.ctlz.v8i32"] + fn ctlz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8; + #[link_name = "llvm.ctlz.v16i32"] + fn ctlz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16; + + #[link_name = "llvm.ctlz.v2i64"] + fn ctlz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2; + #[link_name = "llvm.ctlz.v4i64"] + fn ctlz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4; + #[link_name = "llvm.ctlz.v8i64"] + fn ctlz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8; + + #[link_name = "llvm.ctlz.v1i128"] + fn ctlz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1; + #[link_name = "llvm.ctlz.v2i128"] + fn ctlz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2; + #[link_name = "llvm.ctlz.v4i128"] + fn ctlz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4; + + #[link_name = "llvm.cttz.v2i8"] + fn cttz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2; + #[link_name = "llvm.cttz.v4i8"] + fn cttz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4; + #[link_name = "llvm.cttz.v8i8"] + fn cttz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8; + #[link_name = "llvm.cttz.v16i8"] + fn cttz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16; + #[link_name = "llvm.cttz.v32i8"] + fn cttz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32; + #[link_name = "llvm.cttz.v64i8"] + fn cttz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64; + + #[link_name = "llvm.cttz.v2i16"] + fn cttz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2; + #[link_name = "llvm.cttz.v4i16"] + fn cttz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4; + #[link_name = "llvm.cttz.v8i16"] + fn cttz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8; + #[link_name = "llvm.cttz.v16i16"] + fn cttz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16; + #[link_name = "llvm.cttz.v32i16"] + fn cttz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32; + + #[link_name = "llvm.cttz.v2i32"] + fn cttz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2; + #[link_name = "llvm.cttz.v4i32"] + fn cttz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4; + #[link_name = "llvm.cttz.v8i32"] + fn cttz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8; + #[link_name = "llvm.cttz.v16i32"] + fn cttz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16; + + #[link_name = "llvm.cttz.v2i64"] + fn cttz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2; + #[link_name = 
"llvm.cttz.v4i64"] + fn cttz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4; + #[link_name = "llvm.cttz.v8i64"] + fn cttz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8; + + #[link_name = "llvm.cttz.v1i128"] + fn cttz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1; + #[link_name = "llvm.cttz.v2i128"] + fn cttz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2; + #[link_name = "llvm.cttz.v4i128"] + fn cttz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4; + + #[link_name = "llvm.ctpop.v2i8"] + fn ctpop_u8x2(x: u8x2) -> u8x2; + #[link_name = "llvm.ctpop.v4i8"] + fn ctpop_u8x4(x: u8x4) -> u8x4; + #[link_name = "llvm.ctpop.v8i8"] + fn ctpop_u8x8(x: u8x8) -> u8x8; + #[link_name = "llvm.ctpop.v16i8"] + fn ctpop_u8x16(x: u8x16) -> u8x16; + #[link_name = "llvm.ctpop.v32i8"] + fn ctpop_u8x32(x: u8x32) -> u8x32; + #[link_name = "llvm.ctpop.v64i8"] + fn ctpop_u8x64(x: u8x64) -> u8x64; + + #[link_name = "llvm.ctpop.v2i16"] + fn ctpop_u16x2(x: u16x2) -> u16x2; + #[link_name = "llvm.ctpop.v4i16"] + fn ctpop_u16x4(x: u16x4) -> u16x4; + #[link_name = "llvm.ctpop.v8i16"] + fn ctpop_u16x8(x: u16x8) -> u16x8; + #[link_name = "llvm.ctpop.v16i16"] + fn ctpop_u16x16(x: u16x16) -> u16x16; + #[link_name = "llvm.ctpop.v32i16"] + fn ctpop_u16x32(x: u16x32) -> u16x32; + + #[link_name = "llvm.ctpop.v2i32"] + fn ctpop_u32x2(x: u32x2) -> u32x2; + #[link_name = "llvm.ctpop.v4i32"] + fn ctpop_u32x4(x: u32x4) -> u32x4; + #[link_name = "llvm.ctpop.v8i32"] + fn ctpop_u32x8(x: u32x8) -> u32x8; + #[link_name = "llvm.ctpop.v16i32"] + fn ctpop_u32x16(x: u32x16) -> u32x16; + + #[link_name = "llvm.ctpop.v2i64"] + fn ctpop_u64x2(x: u64x2) -> u64x2; + #[link_name = "llvm.ctpop.v4i64"] + fn ctpop_u64x4(x: u64x4) -> u64x4; + #[link_name = "llvm.ctpop.v8i64"] + fn ctpop_u64x8(x: u64x8) -> u64x8; + + #[link_name = "llvm.ctpop.v1i128"] + fn ctpop_u128x1(x: u128x1) -> u128x1; + #[link_name = "llvm.ctpop.v2i128"] + fn ctpop_u128x2(x: u128x2) -> u128x2; + #[link_name = "llvm.ctpop.v4i128"] + fn ctpop_u128x4(x: u128x4) -> u128x4; +} + +crate trait BitManip { + fn ctpop(self) -> Self; + fn ctlz(self) -> Self; + fn cttz(self) -> Self; +} + +macro_rules! impl_bit_manip { + (inner: $ty:ident, $scalar:ty, $uty:ident, + $ctpop:ident, $ctlz:ident, $cttz:ident) => { + // FIXME: several LLVM intrinsics break on s390x https://github.com/rust-lang-nursery/packed_simd/issues/192 + #[cfg(target_arch = "s390x")] + impl_bit_manip! { scalar: $ty, $scalar } + #[cfg(not(target_arch = "s390x"))] + impl BitManip for $ty { + #[inline] + fn ctpop(self) -> Self { + let y: $uty = self.cast(); + unsafe { $ctpop(y).cast() } + } + + #[inline] + fn ctlz(self) -> Self { + let y: $uty = self.cast(); + // the ctxx intrinsics need compile-time constant + // `is_zero_undef` + unsafe { $ctlz(y, false).cast() } + } + + #[inline] + fn cttz(self) -> Self { + let y: $uty = self.cast(); + unsafe { $cttz(y, false).cast() } + } + } + }; + (sized_inner: $ty:ident, $scalar:ty, $uty:ident) => { + #[cfg(target_arch = "s390x")] + impl_bit_manip! 
{ scalar: $ty, $scalar } + #[cfg(not(target_arch = "s390x"))] + impl BitManip for $ty { + #[inline] + fn ctpop(self) -> Self { + let y: $uty = self.cast(); + $uty::ctpop(y).cast() + } + + #[inline] + fn ctlz(self) -> Self { + let y: $uty = self.cast(); + $uty::ctlz(y).cast() + } + + #[inline] + fn cttz(self) -> Self { + let y: $uty = self.cast(); + $uty::cttz(y).cast() + } + } + }; + (scalar: $ty:ident, $scalar:ty) => { + impl BitManip for $ty { + #[inline] + fn ctpop(self) -> Self { + let mut ones = self; + for i in 0..Self::lanes() { + ones = ones + .replace(i, self.extract(i).count_ones() as $scalar); + } + ones + } + + #[inline] + fn ctlz(self) -> Self { + let mut lz = self; + for i in 0..Self::lanes() { + lz = lz.replace( + i, + self.extract(i).leading_zeros() as $scalar, + ); + } + lz + } + + #[inline] + fn cttz(self) -> Self { + let mut tz = self; + for i in 0..Self::lanes() { + tz = tz.replace( + i, + self.extract(i).trailing_zeros() as $scalar, + ); + } + tz + } + } + }; + ($uty:ident, $uscalar:ty, $ity:ident, $iscalar:ty, + $ctpop:ident, $ctlz:ident, $cttz:ident) => { + impl_bit_manip! { inner: $uty, $uscalar, $uty, $ctpop, $ctlz, $cttz } + impl_bit_manip! { inner: $ity, $iscalar, $uty, $ctpop, $ctlz, $cttz } + }; + (sized: $usize:ident, $uscalar:ty, $isize:ident, + $iscalar:ty, $ty:ident) => { + impl_bit_manip! { sized_inner: $usize, $uscalar, $ty } + impl_bit_manip! { sized_inner: $isize, $iscalar, $ty } + }; +} + +impl_bit_manip! { u8x2 , u8, i8x2, i8, ctpop_u8x2, ctlz_u8x2, cttz_u8x2 } +impl_bit_manip! { u8x4 , u8, i8x4, i8, ctpop_u8x4, ctlz_u8x4, cttz_u8x4 } +#[cfg(not(target_arch = "aarch64"))] // see below +impl_bit_manip! { u8x8 , u8, i8x8, i8, ctpop_u8x8, ctlz_u8x8, cttz_u8x8 } +impl_bit_manip! { u8x16 , u8, i8x16, i8, ctpop_u8x16, ctlz_u8x16, cttz_u8x16 } +impl_bit_manip! { u8x32 , u8, i8x32, i8, ctpop_u8x32, ctlz_u8x32, cttz_u8x32 } +impl_bit_manip! { u8x64 , u8, i8x64, i8, ctpop_u8x64, ctlz_u8x64, cttz_u8x64 } +impl_bit_manip! { u16x2 , u16, i16x2, i16, ctpop_u16x2, ctlz_u16x2, cttz_u16x2 } +impl_bit_manip! { u16x4 , u16, i16x4, i16, ctpop_u16x4, ctlz_u16x4, cttz_u16x4 } +impl_bit_manip! { u16x8 , u16, i16x8, i16, ctpop_u16x8, ctlz_u16x8, cttz_u16x8 } +impl_bit_manip! { u16x16 , u16, i16x16, i16, ctpop_u16x16, ctlz_u16x16, cttz_u16x16 } +impl_bit_manip! { u16x32 , u16, i16x32, i16, ctpop_u16x32, ctlz_u16x32, cttz_u16x32 } +impl_bit_manip! { u32x2 , u32, i32x2, i32, ctpop_u32x2, ctlz_u32x2, cttz_u32x2 } +impl_bit_manip! { u32x4 , u32, i32x4, i32, ctpop_u32x4, ctlz_u32x4, cttz_u32x4 } +impl_bit_manip! { u32x8 , u32, i32x8, i32, ctpop_u32x8, ctlz_u32x8, cttz_u32x8 } +impl_bit_manip! { u32x16 , u32, i32x16, i32, ctpop_u32x16, ctlz_u32x16, cttz_u32x16 } +impl_bit_manip! { u64x2 , u64, i64x2, i64, ctpop_u64x2, ctlz_u64x2, cttz_u64x2 } +impl_bit_manip! { u64x4 , u64, i64x4, i64, ctpop_u64x4, ctlz_u64x4, cttz_u64x4 } +impl_bit_manip! { u64x8 , u64, i64x8, i64, ctpop_u64x8, ctlz_u64x8, cttz_u64x8 } +impl_bit_manip! { u128x1 , u128, i128x1, i128, ctpop_u128x1, ctlz_u128x1, cttz_u128x1 } +impl_bit_manip! { u128x2 , u128, i128x2, i128, ctpop_u128x2, ctlz_u128x2, cttz_u128x2 } +impl_bit_manip! 
{ u128x4 , u128, i128x4, i128, ctpop_u128x4, ctlz_u128x4, cttz_u128x4 } + +#[cfg(target_arch = "aarch64")] +impl BitManip for u8x8 { + #[inline] + fn ctpop(self) -> Self { + let y: u8x8 = self.cast(); + unsafe { ctpop_u8x8(y).cast() } + } + + #[inline] + fn ctlz(self) -> Self { + let y: u8x8 = self.cast(); + unsafe { ctlz_u8x8(y, false).cast() } + } + + #[inline] + fn cttz(self) -> Self { + // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191 + // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64 + // intrinsics + let mut tz = self; + for i in 0..Self::lanes() { + tz = tz.replace(i, self.extract(i).trailing_zeros() as u8); + } + tz + } +} +#[cfg(target_arch = "aarch64")] +impl BitManip for i8x8 { + #[inline] + fn ctpop(self) -> Self { + let y: u8x8 = self.cast(); + unsafe { ctpop_u8x8(y).cast() } + } + + #[inline] + fn ctlz(self) -> Self { + let y: u8x8 = self.cast(); + unsafe { ctlz_u8x8(y, false).cast() } + } + + #[inline] + fn cttz(self) -> Self { + // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191 + // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64 + // intrinsics + let mut tz = self; + for i in 0..Self::lanes() { + tz = tz.replace(i, self.extract(i).trailing_zeros() as i8); + } + tz + } +} + +cfg_if! { + if #[cfg(target_pointer_width = "8")] { + impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u8x2 } + impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u8x4 } + impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u8x8 } + } else if #[cfg(target_pointer_width = "16")] { + impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u16x2 } + impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u16x4 } + impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u16x8 } + } else if #[cfg(target_pointer_width = "32")] { + impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u32x2 } + impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u32x4 } + impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u32x8 } + } else if #[cfg(target_pointer_width = "64")] { + impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u64x2 } + impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u64x4 } + impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u64x8 } + } else { + compile_error!("unsupported target_pointer_width"); + } +} diff --git a/vendor/packed_simd_2/src/codegen/llvm.rs b/vendor/packed_simd_2/src/codegen/llvm.rs new file mode 100644 index 000000000..93c6ce6b7 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/llvm.rs @@ -0,0 +1,107 @@ +//! LLVM's platform intrinsics +#![allow(dead_code)] + +use crate::sealed::Shuffle; +#[allow(unused_imports)] // FIXME: spurious warning? +use crate::sealed::Simd; + +// Shuffle intrinsics: expanded in users' crates, therefore public. +extern "platform-intrinsic" { + // FIXME: Passing this intrinsics an `idx` array with an index that is + // out-of-bounds will produce a monomorphization-time error. 
+ // https://github.com/rust-lang-nursery/packed_simd/issues/21 + #[rustc_args_required_const(2)] + pub fn simd_shuffle2(x: T, y: T, idx: [u32; 2]) -> U + where + T: Simd, + ::Element: Shuffle<[u32; 2], Output = U>; + + #[rustc_args_required_const(2)] + pub fn simd_shuffle4(x: T, y: T, idx: [u32; 4]) -> U + where + T: Simd, + ::Element: Shuffle<[u32; 4], Output = U>; + + #[rustc_args_required_const(2)] + pub fn simd_shuffle8(x: T, y: T, idx: [u32; 8]) -> U + where + T: Simd, + ::Element: Shuffle<[u32; 8], Output = U>; + + #[rustc_args_required_const(2)] + pub fn simd_shuffle16(x: T, y: T, idx: [u32; 16]) -> U + where + T: Simd, + ::Element: Shuffle<[u32; 16], Output = U>; + + #[rustc_args_required_const(2)] + pub fn simd_shuffle32(x: T, y: T, idx: [u32; 32]) -> U + where + T: Simd, + ::Element: Shuffle<[u32; 32], Output = U>; + + #[rustc_args_required_const(2)] + pub fn simd_shuffle64(x: T, y: T, idx: [u32; 64]) -> U + where + T: Simd, + ::Element: Shuffle<[u32; 64], Output = U>; +} + +pub use self::simd_shuffle16 as __shuffle_vector16; +pub use self::simd_shuffle2 as __shuffle_vector2; +pub use self::simd_shuffle32 as __shuffle_vector32; +pub use self::simd_shuffle4 as __shuffle_vector4; +pub use self::simd_shuffle64 as __shuffle_vector64; +pub use self::simd_shuffle8 as __shuffle_vector8; + +extern "platform-intrinsic" { + crate fn simd_eq(x: T, y: T) -> U; + crate fn simd_ne(x: T, y: T) -> U; + crate fn simd_lt(x: T, y: T) -> U; + crate fn simd_le(x: T, y: T) -> U; + crate fn simd_gt(x: T, y: T) -> U; + crate fn simd_ge(x: T, y: T) -> U; + + crate fn simd_insert(x: T, idx: u32, val: U) -> T; + crate fn simd_extract(x: T, idx: u32) -> U; + + crate fn simd_cast(x: T) -> U; + + crate fn simd_add(x: T, y: T) -> T; + crate fn simd_sub(x: T, y: T) -> T; + crate fn simd_mul(x: T, y: T) -> T; + crate fn simd_div(x: T, y: T) -> T; + crate fn simd_rem(x: T, y: T) -> T; + crate fn simd_shl(x: T, y: T) -> T; + crate fn simd_shr(x: T, y: T) -> T; + crate fn simd_and(x: T, y: T) -> T; + crate fn simd_or(x: T, y: T) -> T; + crate fn simd_xor(x: T, y: T) -> T; + + crate fn simd_reduce_add_unordered(x: T) -> U; + crate fn simd_reduce_mul_unordered(x: T) -> U; + crate fn simd_reduce_add_ordered(x: T, acc: U) -> U; + crate fn simd_reduce_mul_ordered(x: T, acc: U) -> U; + crate fn simd_reduce_min(x: T) -> U; + crate fn simd_reduce_max(x: T) -> U; + crate fn simd_reduce_min_nanless(x: T) -> U; + crate fn simd_reduce_max_nanless(x: T) -> U; + crate fn simd_reduce_and(x: T) -> U; + crate fn simd_reduce_or(x: T) -> U; + crate fn simd_reduce_xor(x: T) -> U; + crate fn simd_reduce_all(x: T) -> bool; + crate fn simd_reduce_any(x: T) -> bool; + + crate fn simd_select(m: M, a: T, b: T) -> T; + + crate fn simd_fmin(a: T, b: T) -> T; + crate fn simd_fmax(a: T, b: T) -> T; + + crate fn simd_fsqrt(a: T) -> T; + crate fn simd_fma(a: T, b: T, c: T) -> T; + + crate fn simd_gather(value: T, pointers: P, mask: M) -> T; + crate fn simd_scatter(value: T, pointers: P, mask: M); + + crate fn simd_bitmask(value: T) -> U; +} diff --git a/vendor/packed_simd_2/src/codegen/math.rs b/vendor/packed_simd_2/src/codegen/math.rs new file mode 100644 index 000000000..f3997c7f1 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math.rs @@ -0,0 +1,3 @@ +//! 
Vertical math operations + +crate mod float; diff --git a/vendor/packed_simd_2/src/codegen/math/float.rs b/vendor/packed_simd_2/src/codegen/math/float.rs new file mode 100644 index 000000000..3743b4990 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float.rs @@ -0,0 +1,19 @@ +//! Vertical floating-point math operations. +#![allow(clippy::useless_transmute)] + +#[macro_use] +crate mod macros; +crate mod abs; +crate mod cos; +crate mod cos_pi; +crate mod exp; +crate mod ln; +crate mod mul_add; +crate mod mul_adde; +crate mod powf; +crate mod sin; +crate mod sin_cos_pi; +crate mod sin_pi; +crate mod sqrt; +crate mod sqrte; +crate mod tanh; diff --git a/vendor/packed_simd_2/src/codegen/math/float/abs.rs b/vendor/packed_simd_2/src/codegen/math/float/abs.rs new file mode 100644 index 000000000..bc4421f61 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/abs.rs @@ -0,0 +1,103 @@ +//! Vertical floating-point `fabs` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors fabs + +use crate::*; + +crate trait Abs { + fn abs(self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.fabs.v2f32"] + fn fabs_v2f32(x: f32x2) -> f32x2; + #[link_name = "llvm.fabs.v4f32"] + fn fabs_v4f32(x: f32x4) -> f32x4; + #[link_name = "llvm.fabs.v8f32"] + fn fabs_v8f32(x: f32x8) -> f32x8; + #[link_name = "llvm.fabs.v16f32"] + fn fabs_v16f32(x: f32x16) -> f32x16; + /* FIXME 64-bit fabsgle elem vectors + #[link_name = "llvm.fabs.v1f64"] + fn fabs_v1f64(x: f64x1) -> f64x1; + */ + #[link_name = "llvm.fabs.v2f64"] + fn fabs_v2f64(x: f64x2) -> f64x2; + #[link_name = "llvm.fabs.v4f64"] + fn fabs_v4f64(x: f64x4) -> f64x4; + #[link_name = "llvm.fabs.v8f64"] + fn fabs_v8f64(x: f64x8) -> f64x8; + + #[link_name = "llvm.fabs.f32"] + fn fabs_f32(x: f32) -> f32; + #[link_name = "llvm.fabs.f64"] + fn fabs_f64(x: f64) -> f64; +} + +gen_unary_impl_table!(Abs, abs); + +cfg_if! { + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: fabs_f32); + impl_unary!(f32x4[f32; 4]: fabs_f32); + impl_unary!(f32x8[f32; 8]: fabs_f32); + impl_unary!(f32x16[f32; 16]: fabs_f32); + + impl_unary!(f64x2[f64; 2]: fabs_f64); + impl_unary!(f64x4[f64; 4]: fabs_f64); + impl_unary!(f64x8[f64; 8]: fabs_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_fabsf8_avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_fabsd4_avx2); + + impl_unary!(f32x4: Sleef_fabsf4_avx2128); + impl_unary!(f32x8: Sleef_fabsf8_avx2); + impl_unary!(f64x2: Sleef_fabsd2_avx2128); + impl_unary!(f64x4: Sleef_fabsd4_avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_fabsf8_avx); + impl_unary!(f64x8[h => f64x4]: Sleef_fabsd4_avx); + + impl_unary!(f32x4: Sleef_fabsf4_sse4); + impl_unary!(f32x8: Sleef_fabsf8_avx); + impl_unary!(f64x2: Sleef_fabsd2_sse4); + impl_unary!(f64x4: Sleef_fabsd4_avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_fabsf4_sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_fabsd2_sse4); + + impl_unary!(f32x4: Sleef_fabsf4_sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_fabsf4_sse4); + impl_unary!(f64x2: Sleef_fabsd2_sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_fabsd2_sse4); + } else { + impl_unary!(f32x2[f32; 2]: fabs_f32); + impl_unary!(f32x16: fabs_v16f32); + impl_unary!(f64x8: fabs_v8f64); + + impl_unary!(f32x4: fabs_v4f32); + impl_unary!(f32x8: fabs_v8f32); + impl_unary!(f64x2: fabs_v2f64); + impl_unary!(f64x4: fabs_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: fabs_f32); + impl_unary!(f32x4: fabs_v4f32); + impl_unary!(f32x8: fabs_v8f32); + impl_unary!(f32x16: fabs_v16f32); + + impl_unary!(f64x2: fabs_v2f64); + impl_unary!(f64x4: fabs_v4f64); + impl_unary!(f64x8: fabs_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/cos.rs b/vendor/packed_simd_2/src/codegen/math/float/cos.rs new file mode 100644 index 000000000..50f6c16da --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/cos.rs @@ -0,0 +1,103 @@ +//! Vertical floating-point `cos` +#![allow(unused)] + +// FIXME 64-bit 1 elem vector cos + +use crate::*; + +crate trait Cos { + fn cos(self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.cos.v2f32"] + fn cos_v2f32(x: f32x2) -> f32x2; + #[link_name = "llvm.cos.v4f32"] + fn cos_v4f32(x: f32x4) -> f32x4; + #[link_name = "llvm.cos.v8f32"] + fn cos_v8f32(x: f32x8) -> f32x8; + #[link_name = "llvm.cos.v16f32"] + fn cos_v16f32(x: f32x16) -> f32x16; + /* FIXME 64-bit cosgle elem vectors + #[link_name = "llvm.cos.v1f64"] + fn cos_v1f64(x: f64x1) -> f64x1; + */ + #[link_name = "llvm.cos.v2f64"] + fn cos_v2f64(x: f64x2) -> f64x2; + #[link_name = "llvm.cos.v4f64"] + fn cos_v4f64(x: f64x4) -> f64x4; + #[link_name = "llvm.cos.v8f64"] + fn cos_v8f64(x: f64x8) -> f64x8; + + #[link_name = "llvm.cos.f32"] + fn cos_f32(x: f32) -> f32; + #[link_name = "llvm.cos.f64"] + fn cos_f64(x: f64) -> f64; +} + +gen_unary_impl_table!(Cos, cos); + +cfg_if! { + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: cos_f32); + impl_unary!(f32x4[f32; 4]: cos_f32); + impl_unary!(f32x8[f32; 8]: cos_f32); + impl_unary!(f32x16[f32; 16]: cos_f32); + + impl_unary!(f64x2[f64; 2]: cos_f64); + impl_unary!(f64x4[f64; 4]: cos_f64); + impl_unary!(f64x8[f64; 8]: cos_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_cosf8_u10avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_cosd4_u10avx2); + + impl_unary!(f32x4: Sleef_cosf4_u10avx2128); + impl_unary!(f32x8: Sleef_cosf8_u10avx2); + impl_unary!(f64x2: Sleef_cosd2_u10avx2128); + impl_unary!(f64x4: Sleef_cosd4_u10avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_cosf8_u10avx); + impl_unary!(f64x8[h => f64x4]: Sleef_cosd4_u10avx); + + impl_unary!(f32x4: Sleef_cosf4_u10sse4); + impl_unary!(f32x8: Sleef_cosf8_u10avx); + impl_unary!(f64x2: Sleef_cosd2_u10sse4); + impl_unary!(f64x4: Sleef_cosd4_u10avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_cosf4_u10sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_cosd2_u10sse4); + + impl_unary!(f32x4: Sleef_cosf4_u10sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_cosf4_u10sse4); + impl_unary!(f64x2: Sleef_cosd2_u10sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_cosd2_u10sse4); + } else { + impl_unary!(f32x2[f32; 2]: cos_f32); + impl_unary!(f32x16: cos_v16f32); + impl_unary!(f64x8: cos_v8f64); + + impl_unary!(f32x4: cos_v4f32); + impl_unary!(f32x8: cos_v8f32); + impl_unary!(f64x2: cos_v2f64); + impl_unary!(f64x4: cos_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: cos_f32); + impl_unary!(f32x4: cos_v4f32); + impl_unary!(f32x8: cos_v8f32); + impl_unary!(f32x16: cos_v16f32); + + impl_unary!(f64x2: cos_v2f64); + impl_unary!(f64x4: cos_v4f64); + impl_unary!(f64x8: cos_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/cos_pi.rs b/vendor/packed_simd_2/src/codegen/math/float/cos_pi.rs new file mode 100644 index 000000000..ebff5fd1c --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/cos_pi.rs @@ -0,0 +1,87 @@ +//! Vertical floating-point `cos` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors cos_pi + +use crate::*; + +crate trait CosPi { + fn cos_pi(self) -> Self; +} + +gen_unary_impl_table!(CosPi, cos_pi); + +macro_rules! impl_def { + ($vid:ident, $PI:path) => { + impl CosPi for $vid { + #[inline] + fn cos_pi(self) -> Self { + (self * Self::splat($PI)).cos() + } + } + }; +} +macro_rules! impl_def32 { + ($vid:ident) => { + impl_def!($vid, crate::f32::consts::PI); + }; +} +macro_rules! impl_def64 { + ($vid:ident) => { + impl_def!($vid, crate::f64::consts::PI); + }; +} + +cfg_if! { + if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_cospif8_u05avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_cospid4_u05avx2); + + impl_unary!(f32x4: Sleef_cospif4_u05avx2128); + impl_unary!(f32x8: Sleef_cospif8_u05avx2); + impl_unary!(f64x2: Sleef_cospid2_u05avx2128); + impl_unary!(f64x4: Sleef_cospid4_u05avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_cospif8_u05avx); + impl_unary!(f64x8[h => f64x4]: Sleef_cospid4_u05avx); + + impl_unary!(f32x4: Sleef_cospif4_u05sse4); + impl_unary!(f32x8: Sleef_cospif8_u05avx); + impl_unary!(f64x2: Sleef_cospid2_u05sse4); + impl_unary!(f64x4: Sleef_cospid4_u05avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_cospif4_u05sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_cospid2_u05sse4); + + impl_unary!(f32x4: Sleef_cospif4_u05sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_cospif4_u05sse4); + impl_unary!(f64x2: Sleef_cospid2_u05sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_cospid2_u05sse4); + } else { + impl_def32!(f32x2); + impl_def32!(f32x4); + impl_def32!(f32x8); + impl_def32!(f32x16); + + impl_def64!(f64x2); + impl_def64!(f64x4); + impl_def64!(f64x8); + } + } + } else { + impl_def32!(f32x2); + impl_def32!(f32x4); + impl_def32!(f32x8); + impl_def32!(f32x16); + + impl_def64!(f64x2); + impl_def64!(f64x4); + impl_def64!(f64x8); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/exp.rs b/vendor/packed_simd_2/src/codegen/math/float/exp.rs new file mode 100644 index 000000000..00d10e9fa --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/exp.rs @@ -0,0 +1,112 @@ +//! Vertical floating-point `exp` +#![allow(unused)] + +// FIXME 64-bit expgle elem vectors misexpg + +use crate::*; + +crate trait Exp { + fn exp(self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.exp.v2f32"] + fn exp_v2f32(x: f32x2) -> f32x2; + #[link_name = "llvm.exp.v4f32"] + fn exp_v4f32(x: f32x4) -> f32x4; + #[link_name = "llvm.exp.v8f32"] + fn exp_v8f32(x: f32x8) -> f32x8; + #[link_name = "llvm.exp.v16f32"] + fn exp_v16f32(x: f32x16) -> f32x16; + /* FIXME 64-bit expgle elem vectors + #[link_name = "llvm.exp.v1f64"] + fn exp_v1f64(x: f64x1) -> f64x1; + */ + #[link_name = "llvm.exp.v2f64"] + fn exp_v2f64(x: f64x2) -> f64x2; + #[link_name = "llvm.exp.v4f64"] + fn exp_v4f64(x: f64x4) -> f64x4; + #[link_name = "llvm.exp.v8f64"] + fn exp_v8f64(x: f64x8) -> f64x8; + + #[link_name = "llvm.exp.f32"] + fn exp_f32(x: f32) -> f32; + #[link_name = "llvm.exp.f64"] + fn exp_f64(x: f64) -> f64; +} + +gen_unary_impl_table!(Exp, exp); + +cfg_if! { + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: exp_f32); + impl_unary!(f32x4[f32; 4]: exp_f32); + impl_unary!(f32x8[f32; 8]: exp_f32); + impl_unary!(f32x16[f32; 16]: exp_f32); + + impl_unary!(f64x2[f64; 2]: exp_f64); + impl_unary!(f64x4[f64; 4]: exp_f64); + impl_unary!(f64x8[f64; 8]: exp_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_expf8_u10avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_expd4_u10avx2); + + impl_unary!(f32x4: Sleef_expf4_u10avx2128); + impl_unary!(f32x8: Sleef_expf8_u10avx2); + impl_unary!(f64x2: Sleef_expd2_u10avx2128); + impl_unary!(f64x4: Sleef_expd4_u10avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_expf8_u10avx); + impl_unary!(f64x8[h => f64x4]: Sleef_expd4_u10avx); + + impl_unary!(f32x4: Sleef_expf4_u10sse4); + impl_unary!(f32x8: Sleef_expf8_u10avx); + impl_unary!(f64x2: Sleef_expd2_u10sse4); + impl_unary!(f64x4: Sleef_expd4_u10avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_expf4_u10sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_expd2_u10sse4); + + impl_unary!(f32x4: Sleef_expf4_u10sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_expf4_u10sse4); + impl_unary!(f64x2: Sleef_expd2_u10sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_expd2_u10sse4); + } else if #[cfg(target_feature = "sse2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse2); + impl_unary!(f32x16[q => f32x4]: Sleef_expf4_u10sse2); + impl_unary!(f64x8[q => f64x2]: Sleef_expd2_u10sse2); + + impl_unary!(f32x4: Sleef_expf4_u10sse2); + impl_unary!(f32x8[h => f32x4]: Sleef_expf4_u10sse2); + impl_unary!(f64x2: Sleef_expd2_u10sse2); + impl_unary!(f64x4[h => f64x2]: Sleef_expd2_u10sse2); + } else { + impl_unary!(f32x2[f32; 2]: exp_f32); + impl_unary!(f32x16: exp_v16f32); + impl_unary!(f64x8: exp_v8f64); + + impl_unary!(f32x4: exp_v4f32); + impl_unary!(f32x8: exp_v8f32); + impl_unary!(f64x2: exp_v2f64); + impl_unary!(f64x4: exp_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: exp_f32); + impl_unary!(f32x4: exp_v4f32); + impl_unary!(f32x8: exp_v8f32); + impl_unary!(f32x16: exp_v16f32); + + impl_unary!(f64x2: exp_v2f64); + impl_unary!(f64x4: exp_v4f64); + impl_unary!(f64x8: exp_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/ln.rs b/vendor/packed_simd_2/src/codegen/math/float/ln.rs new file mode 100644 index 000000000..88a5a6c6c --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/ln.rs @@ -0,0 +1,112 @@ +//! Vertical floating-point `ln` +#![allow(unused)] + +// FIXME 64-bit lngle elem vectors mislng + +use crate::*; + +crate trait Ln { + fn ln(self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.log.v2f32"] + fn ln_v2f32(x: f32x2) -> f32x2; + #[link_name = "llvm.log.v4f32"] + fn ln_v4f32(x: f32x4) -> f32x4; + #[link_name = "llvm.log.v8f32"] + fn ln_v8f32(x: f32x8) -> f32x8; + #[link_name = "llvm.log.v16f32"] + fn ln_v16f32(x: f32x16) -> f32x16; + /* FIXME 64-bit lngle elem vectors + #[link_name = "llvm.log.v1f64"] + fn ln_v1f64(x: f64x1) -> f64x1; + */ + #[link_name = "llvm.log.v2f64"] + fn ln_v2f64(x: f64x2) -> f64x2; + #[link_name = "llvm.log.v4f64"] + fn ln_v4f64(x: f64x4) -> f64x4; + #[link_name = "llvm.log.v8f64"] + fn ln_v8f64(x: f64x8) -> f64x8; + + #[link_name = "llvm.log.f32"] + fn ln_f32(x: f32) -> f32; + #[link_name = "llvm.log.f64"] + fn ln_f64(x: f64) -> f64; +} + +gen_unary_impl_table!(Ln, ln); + +cfg_if! 
{ + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: ln_f32); + impl_unary!(f32x4[f32; 4]: ln_f32); + impl_unary!(f32x8[f32; 8]: ln_f32); + impl_unary!(f32x16[f32; 16]: ln_f32); + + impl_unary!(f64x2[f64; 2]: ln_f64); + impl_unary!(f64x4[f64; 4]: ln_f64); + impl_unary!(f64x8[f64; 8]: ln_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! { + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_logf8_u10avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_logd4_u10avx2); + + impl_unary!(f32x4: Sleef_logf4_u10avx2128); + impl_unary!(f32x8: Sleef_logf8_u10avx2); + impl_unary!(f64x2: Sleef_logd2_u10avx2128); + impl_unary!(f64x4: Sleef_logd4_u10avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_logf8_u10avx); + impl_unary!(f64x8[h => f64x4]: Sleef_logd4_u10avx); + + impl_unary!(f32x4: Sleef_logf4_u10sse4); + impl_unary!(f32x8: Sleef_logf8_u10avx); + impl_unary!(f64x2: Sleef_logd2_u10sse4); + impl_unary!(f64x4: Sleef_logd4_u10avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_logf4_u10sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_logd2_u10sse4); + + impl_unary!(f32x4: Sleef_logf4_u10sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_logf4_u10sse4); + impl_unary!(f64x2: Sleef_logd2_u10sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_logd2_u10sse4); + } else if #[cfg(target_feature = "sse2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse2); + impl_unary!(f32x16[q => f32x4]: Sleef_logf4_u10sse2); + impl_unary!(f64x8[q => f64x2]: Sleef_logd2_u10sse2); + + impl_unary!(f32x4: Sleef_logf4_u10sse2); + impl_unary!(f32x8[h => f32x4]: Sleef_logf4_u10sse2); + impl_unary!(f64x2: Sleef_logd2_u10sse2); + impl_unary!(f64x4[h => f64x2]: Sleef_logd2_u10sse2); + } else { + impl_unary!(f32x2[f32; 2]: ln_f32); + impl_unary!(f32x16: ln_v16f32); + impl_unary!(f64x8: ln_v8f64); + + impl_unary!(f32x4: ln_v4f32); + impl_unary!(f32x8: ln_v8f32); + impl_unary!(f64x2: ln_v2f64); + impl_unary!(f64x4: ln_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: ln_f32); + impl_unary!(f32x4: ln_v4f32); + impl_unary!(f32x8: ln_v8f32); + impl_unary!(f32x16: ln_v16f32); + + impl_unary!(f64x2: ln_v2f64); + impl_unary!(f64x4: ln_v4f64); + impl_unary!(f64x8: ln_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/macros.rs b/vendor/packed_simd_2/src/codegen/math/float/macros.rs new file mode 100644 index 000000000..02d0ca3f5 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/macros.rs @@ -0,0 +1,559 @@ +//! Utility macros +#![allow(unused)] + + +macro_rules! impl_unary_ { + // implementation mapping 1:1 + (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self) -> Self { + unsafe { + use crate::mem::transmute; + transmute($fun(transmute(self))) + } + } + } + }; + // implementation mapping 1:1 for when `$fun` is a generic function + // like some of the fp math rustc intrinsics (e.g. `fn fun(x: T) -> T`). 
+ (gen | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self) -> Self { + unsafe { + use crate::mem::transmute; + transmute($fun(self.0)) + } + } + } + }; + (scalar | $trait_id:ident, $trait_method:ident, + $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self) -> Self { + unsafe { + union U { + vec: $vec_id, + scalars: [$sid; $scount], + } + let mut scalars = U { vec: self }.scalars; + for i in &mut scalars { + *i = $fun(*i); + } + U { scalars }.vec + } + } + } + }; + // implementation calling fun twice on each of the vector halves: + (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vech_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self) -> Self { + unsafe { + use crate::mem::transmute; + union U { + vec: $vec_id, + halves: [$vech_id; 2], + } + + let mut halves = U { vec: self }.halves; + + *halves.get_unchecked_mut(0) = + transmute($fun(transmute(*halves.get_unchecked(0)))); + *halves.get_unchecked_mut(1) = + transmute($fun(transmute(*halves.get_unchecked(1)))); + + U { halves }.vec + } + } + } + }; + // implementation calling fun four times on each of the vector quarters: + (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vecq_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self) -> Self { + unsafe { + use crate::mem::transmute; + union U { + vec: $vec_id, + quarters: [$vecq_id; 4], + } + + let mut quarters = U { vec: self }.quarters; + + *quarters.get_unchecked_mut(0) = + transmute($fun(transmute(*quarters.get_unchecked(0)))); + *quarters.get_unchecked_mut(1) = + transmute($fun(transmute(*quarters.get_unchecked(1)))); + *quarters.get_unchecked_mut(2) = + transmute($fun(transmute(*quarters.get_unchecked(2)))); + *quarters.get_unchecked_mut(3) = + transmute($fun(transmute(*quarters.get_unchecked(3)))); + + U { quarters }.vec + } + } + } + }; + // implementation calling fun once on a vector twice as large: + (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vect_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self) -> Self { + unsafe { + use crate::mem::{transmute, uninitialized}; + + union U { + vec: [$vec_id; 2], + twice: $vect_id, + } + + let twice = U { vec: [self, uninitialized()] }.twice; + let twice = transmute($fun(transmute(twice))); + + *(U { twice }.vec.get_unchecked(0)) + } + } + } + }; +} + +macro_rules! gen_unary_impl_table { + ($trait_id:ident, $trait_method:ident) => { + macro_rules! 
impl_unary { + ($vid:ident: $fun:ident) => { + impl_unary_!(vec | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[g]: $fun:ident) => { + impl_unary_!(gen | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => { + impl_unary_!( + scalar | $trait_id, + $trait_method, + $vid, + [$sid; $sc], + $fun + ); + }; + ($vid:ident[s]: $fun:ident) => { + impl_unary_!(scalar | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[h => $vid_h:ident]: $fun:ident) => { + impl_unary_!( + halves | $trait_id, + $trait_method, + $vid, + $vid_h, + $fun + ); + }; + ($vid:ident[q => $vid_q:ident]: $fun:ident) => { + impl_unary_!( + quarter | $trait_id, + $trait_method, + $vid, + $vid_q, + $fun + ); + }; + ($vid:ident[t => $vid_t:ident]: $fun:ident) => { + impl_unary_!( + twice | $trait_id, + $trait_method, + $vid, + $vid_t, + $fun + ); + }; + } + }; +} + +macro_rules! impl_tertiary_ { + // implementation mapping 1:1 + (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self, z: Self) -> Self { + unsafe { + use crate::mem::transmute; + transmute($fun( + transmute(self), + transmute(y), + transmute(z), + )) + } + } + } + }; + (scalar | $trait_id:ident, $trait_method:ident, + $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self, z: Self) -> Self { + unsafe { + union U { + vec: $vec_id, + scalars: [$sid; $scount], + } + let mut x = U { vec: self }.scalars; + let y = U { vec: y }.scalars; + let z = U { vec: z }.scalars; + for (x, (y, z)) in (&mut scalars).zip(&y).zip(&z) { + *i = $fun(*i, *y, *z); + } + U { vec: x }.vec + } + } + } + }; + // implementation calling fun twice on each of the vector halves: + (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vech_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self, z: Self) -> Self { + unsafe { + use crate::mem::transmute; + union U { + vec: $vec_id, + halves: [$vech_id; 2], + } + + let mut x_halves = U { vec: self }.halves; + let y_halves = U { vec: y }.halves; + let z_halves = U { vec: z }.halves; + + *x_halves.get_unchecked_mut(0) = transmute($fun( + transmute(*x_halves.get_unchecked(0)), + transmute(*y_halves.get_unchecked(0)), + transmute(*z_halves.get_unchecked(0)), + )); + *x_halves.get_unchecked_mut(1) = transmute($fun( + transmute(*x_halves.get_unchecked(1)), + transmute(*y_halves.get_unchecked(1)), + transmute(*z_halves.get_unchecked(1)), + )); + + U { halves: x_halves }.vec + } + } + } + }; + // implementation calling fun four times on each of the vector quarters: + (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vecq_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self, z: Self) -> Self { + unsafe { + use crate::mem::transmute; + union U { + vec: $vec_id, + quarters: [$vecq_id; 4], + } + + let mut x_quarters = U { vec: self }.quarters; + let y_quarters = U { vec: y }.quarters; + let z_quarters = U { vec: z }.quarters; + + *x_quarters.get_unchecked_mut(0) = transmute($fun( + transmute(*x_quarters.get_unchecked(0)), + transmute(*y_quarters.get_unchecked(0)), + transmute(*z_quarters.get_unchecked(0)), + )); + + *x_quarters.get_unchecked_mut(1) = transmute($fun( + transmute(*x_quarters.get_unchecked(1)), + transmute(*y_quarters.get_unchecked(1)), + transmute(*z_quarters.get_unchecked(1)), + )); + + 
*x_quarters.get_unchecked_mut(2) = transmute($fun( + transmute(*x_quarters.get_unchecked(2)), + transmute(*y_quarters.get_unchecked(2)), + transmute(*z_quarters.get_unchecked(2)), + )); + + *x_quarters.get_unchecked_mut(3) = transmute($fun( + transmute(*x_quarters.get_unchecked(3)), + transmute(*y_quarters.get_unchecked(3)), + transmute(*z_quarters.get_unchecked(3)), + )); + + U { quarters: x_quarters }.vec + } + } + } + }; + // implementation calling fun once on a vector twice as large: + (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vect_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self, z: Self) -> Self { + unsafe { + use crate::mem::{transmute, uninitialized}; + + union U { + vec: [$vec_id; 2], + twice: $vect_id, + } + + let x_twice = U { vec: [self, uninitialized()] }.twice; + let y_twice = U { vec: [y, uninitialized()] }.twice; + let z_twice = U { vec: [z, uninitialized()] }.twice; + let twice: $vect_id = transmute($fun( + transmute(x_twice), + transmute(y_twice), + transmute(z_twice), + )); + + *(U { twice }.vec.get_unchecked(0)) + } + } + } + }; +} + +macro_rules! gen_tertiary_impl_table { + ($trait_id:ident, $trait_method:ident) => { + macro_rules! impl_tertiary { + ($vid:ident: $fun:ident) => { + impl_tertiary_!(vec | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => { + impl_tertiary_!( + scalar | $trait_id, + $trait_method, + $vid, + [$sid; $sc], + $fun + ); + }; + ($vid:ident[s]: $fun:ident) => { + impl_tertiary_!(scalar | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[h => $vid_h:ident]: $fun:ident) => { + impl_tertiary_!( + halves | $trait_id, + $trait_method, + $vid, + $vid_h, + $fun + ); + }; + ($vid:ident[q => $vid_q:ident]: $fun:ident) => { + impl_tertiary_!( + quarter | $trait_id, + $trait_method, + $vid, + $vid_q, + $fun + ); + }; + ($vid:ident[t => $vid_t:ident]: $fun:ident) => { + impl_tertiary_!( + twice | $trait_id, + $trait_method, + $vid, + $vid_t, + $fun + ); + }; + } + }; +} + +macro_rules! 
impl_binary_ { + // implementation mapping 1:1 + (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self) -> Self { + unsafe { + use crate::mem::transmute; + transmute($fun(transmute(self), transmute(y))) + } + } + } + }; + (scalar | $trait_id:ident, $trait_method:ident, + $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self) -> Self { + unsafe { + union U { + vec: $vec_id, + scalars: [$sid; $scount], + } + let mut x = U { vec: self }.scalars; + let y = U { vec: y }.scalars; + for (x, y) in x.iter_mut().zip(&y) { + *x = $fun(*x, *y); + } + U { scalars: x }.vec + } + } + } + }; + // implementation calling fun twice on each of the vector halves: + (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vech_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self) -> Self { + unsafe { + use crate::mem::transmute; + union U { + vec: $vec_id, + halves: [$vech_id; 2], + } + + let mut x_halves = U { vec: self }.halves; + let y_halves = U { vec: y }.halves; + + *x_halves.get_unchecked_mut(0) = transmute($fun( + transmute(*x_halves.get_unchecked(0)), + transmute(*y_halves.get_unchecked(0)), + )); + *x_halves.get_unchecked_mut(1) = transmute($fun( + transmute(*x_halves.get_unchecked(1)), + transmute(*y_halves.get_unchecked(1)), + )); + + U { halves: x_halves }.vec + } + } + } + }; + // implementation calling fun four times on each of the vector quarters: + (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vecq_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self) -> Self { + unsafe { + use crate::mem::transmute; + union U { + vec: $vec_id, + quarters: [$vecq_id; 4], + } + + let mut x_quarters = U { vec: self }.quarters; + let y_quarters = U { vec: y }.quarters; + + *x_quarters.get_unchecked_mut(0) = transmute($fun( + transmute(*x_quarters.get_unchecked(0)), + transmute(*y_quarters.get_unchecked(0)), + )); + + *x_quarters.get_unchecked_mut(1) = transmute($fun( + transmute(*x_quarters.get_unchecked(1)), + transmute(*y_quarters.get_unchecked(1)), + )); + + *x_quarters.get_unchecked_mut(2) = transmute($fun( + transmute(*x_quarters.get_unchecked(2)), + transmute(*y_quarters.get_unchecked(2)), + )); + + *x_quarters.get_unchecked_mut(3) = transmute($fun( + transmute(*x_quarters.get_unchecked(3)), + transmute(*y_quarters.get_unchecked(3)), + )); + + U { quarters: x_quarters }.vec + } + } + } + }; + // implementation calling fun once on a vector twice as large: + (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vect_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self) -> Self { + unsafe { + use crate::mem::{transmute, uninitialized}; + + union U { + vec: [$vec_id; 2], + twice: $vect_id, + } + + let x_twice = U { vec: [self, uninitialized()] }.twice; + let y_twice = U { vec: [y, uninitialized()] }.twice; + let twice: $vect_id = transmute($fun( + transmute(x_twice), + transmute(y_twice), + )); + + *(U { twice }.vec.get_unchecked(0)) + } + } + } + }; +} + +macro_rules! gen_binary_impl_table { + ($trait_id:ident, $trait_method:ident) => { + macro_rules! 
impl_binary { + ($vid:ident: $fun:ident) => { + impl_binary_!(vec | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => { + impl_binary_!( + scalar | $trait_id, + $trait_method, + $vid, + [$sid; $sc], + $fun + ); + }; + ($vid:ident[s]: $fun:ident) => { + impl_binary_!(scalar | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[h => $vid_h:ident]: $fun:ident) => { + impl_binary_!( + halves | $trait_id, + $trait_method, + $vid, + $vid_h, + $fun + ); + }; + ($vid:ident[q => $vid_q:ident]: $fun:ident) => { + impl_binary_!( + quarter | $trait_id, + $trait_method, + $vid, + $vid_q, + $fun + ); + }; + ($vid:ident[t => $vid_t:ident]: $fun:ident) => { + impl_binary_!( + twice | $trait_id, + $trait_method, + $vid, + $vid_t, + $fun + ); + }; + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/mul_add.rs b/vendor/packed_simd_2/src/codegen/math/float/mul_add.rs new file mode 100644 index 000000000..f48a57dc4 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/mul_add.rs @@ -0,0 +1,109 @@ +//! Vertical floating-point `mul_add` +#![allow(unused)] +use crate::*; + +// FIXME: 64-bit 1 element mul_add + +crate trait MulAdd { + fn mul_add(self, y: Self, z: Self) -> Self; +} + +#[cfg(not(target_arch = "s390x"))] +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.fma.v2f32"] + fn fma_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2; + #[link_name = "llvm.fma.v4f32"] + fn fma_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4; + #[link_name = "llvm.fma.v8f32"] + fn fma_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8; + #[link_name = "llvm.fma.v16f32"] + fn fma_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16; + /* FIXME 64-bit single elem vectors + #[link_name = "llvm.fma.v1f64"] + fn fma_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1; + */ + #[link_name = "llvm.fma.v2f64"] + fn fma_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2; + #[link_name = "llvm.fma.v4f64"] + fn fma_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4; + #[link_name = "llvm.fma.v8f64"] + fn fma_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8; +} + +gen_tertiary_impl_table!(MulAdd, mul_add); + +cfg_if! { + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + macro_rules! impl_broken { + ($id:ident) => { + impl MulAdd for $id { + #[inline] + fn mul_add(self, y: Self, z: Self) -> Self { + self * y + z + } + } + }; + } + + impl_broken!(f32x2); + impl_broken!(f32x4); + impl_broken!(f32x8); + impl_broken!(f32x16); + + impl_broken!(f64x2); + impl_broken!(f64x4); + impl_broken!(f64x8); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_avx2128); + impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx2); + impl_tertiary!(f64x8[h => f64x4]: Sleef_fmad4_avx2); + + impl_tertiary!(f32x4: Sleef_fmaf4_avx2128); + impl_tertiary!(f32x8: Sleef_fmaf8_avx2); + impl_tertiary!(f64x2: Sleef_fmad2_avx2128); + impl_tertiary!(f64x4: Sleef_fmad4_avx2); + } else if #[cfg(target_feature = "avx")] { + impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_sse4); + impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx); + impl_tertiary!(f64x8[h => f64x4]: Sleef_fmad4_avx); + + impl_tertiary!(f32x4: Sleef_fmaf4_sse4); + impl_tertiary!(f32x8: Sleef_fmaf8_avx); + impl_tertiary!(f64x2: Sleef_fmad2_sse4); + impl_tertiary!(f64x4: Sleef_fmad4_avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_sse4); + impl_tertiary!(f32x16[q => f32x4]: Sleef_fmaf4_sse4); + impl_tertiary!(f64x8[q => f64x2]: Sleef_fmad2_sse4); + + impl_tertiary!(f32x4: Sleef_fmaf4_sse4); + impl_tertiary!(f32x8[h => f32x4]: Sleef_fmaf4_sse4); + impl_tertiary!(f64x2: Sleef_fmad2_sse4); + impl_tertiary!(f64x4[h => f64x2]: Sleef_fmad2_sse4); + } else { + impl_tertiary!(f32x2: fma_v2f32); + impl_tertiary!(f32x16: fma_v16f32); + impl_tertiary!(f64x8: fma_v8f64); + + impl_tertiary!(f32x4: fma_v4f32); + impl_tertiary!(f32x8: fma_v8f32); + impl_tertiary!(f64x2: fma_v2f64); + impl_tertiary!(f64x4: fma_v4f64); + } + } + } else { + impl_tertiary!(f32x2: fma_v2f32); + impl_tertiary!(f32x4: fma_v4f32); + impl_tertiary!(f32x8: fma_v8f32); + impl_tertiary!(f32x16: fma_v16f32); + // impl_tertiary!(f64x1: fma_v1f64); // FIXME 64-bit fmagle elem vectors + impl_tertiary!(f64x2: fma_v2f64); + impl_tertiary!(f64x4: fma_v4f64); + impl_tertiary!(f64x8: fma_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/mul_adde.rs b/vendor/packed_simd_2/src/codegen/math/float/mul_adde.rs new file mode 100644 index 000000000..8c41fb131 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/mul_adde.rs @@ -0,0 +1,66 @@ +//! Approximation for floating-point `mul_add` +use crate::*; + +// FIXME: 64-bit 1 element mul_adde + +crate trait MulAddE { + fn mul_adde(self, y: Self, z: Self) -> Self; +} + +#[cfg(not(target_arch = "s390x"))] +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.fmuladd.v2f32"] + fn fmuladd_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2; + #[link_name = "llvm.fmuladd.v4f32"] + fn fmuladd_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4; + #[link_name = "llvm.fmuladd.v8f32"] + fn fmuladd_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8; + #[link_name = "llvm.fmuladd.v16f32"] + fn fmuladd_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16; + /* FIXME 64-bit single elem vectors + #[link_name = "llvm.fmuladd.v1f64"] + fn fmuladd_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1; + */ + #[link_name = "llvm.fmuladd.v2f64"] + fn fmuladd_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2; + #[link_name = "llvm.fmuladd.v4f64"] + fn fmuladd_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4; + #[link_name = "llvm.fmuladd.v8f64"] + fn fmuladd_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8; +} + +macro_rules! 
impl_mul_adde { + ($id:ident : $fn:ident) => { + impl MulAddE for $id { + #[inline] + fn mul_adde(self, y: Self, z: Self) -> Self { + #[cfg(not(target_arch = "s390x"))] + { + use crate::mem::transmute; + unsafe { + transmute($fn( + transmute(self), + transmute(y), + transmute(z), + )) + } + } + #[cfg(target_arch = "s390x")] + { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + self * y + z + } + } + } + }; +} + +impl_mul_adde!(f32x2: fmuladd_v2f32); +impl_mul_adde!(f32x4: fmuladd_v4f32); +impl_mul_adde!(f32x8: fmuladd_v8f32); +impl_mul_adde!(f32x16: fmuladd_v16f32); +// impl_mul_adde!(f64x1: fma_v1f64); // FIXME 64-bit fmagle elem vectors +impl_mul_adde!(f64x2: fmuladd_v2f64); +impl_mul_adde!(f64x4: fmuladd_v4f64); +impl_mul_adde!(f64x8: fmuladd_v8f64); diff --git a/vendor/packed_simd_2/src/codegen/math/float/powf.rs b/vendor/packed_simd_2/src/codegen/math/float/powf.rs new file mode 100644 index 000000000..bc15067d7 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/powf.rs @@ -0,0 +1,112 @@ +//! Vertical floating-point `powf` +#![allow(unused)] + +// FIXME 64-bit powfgle elem vectors mispowfg + +use crate::*; + +crate trait Powf { + fn powf(self, x: Self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.pow.v2f32"] + fn powf_v2f32(x: f32x2, y: f32x2) -> f32x2; + #[link_name = "llvm.pow.v4f32"] + fn powf_v4f32(x: f32x4, y: f32x4) -> f32x4; + #[link_name = "llvm.pow.v8f32"] + fn powf_v8f32(x: f32x8, y: f32x8) -> f32x8; + #[link_name = "llvm.pow.v16f32"] + fn powf_v16f32(x: f32x16, y: f32x16) -> f32x16; + /* FIXME 64-bit powfgle elem vectors + #[link_name = "llvm.pow.v1f64"] + fn powf_v1f64(x: f64x1, y: f64x1) -> f64x1; + */ + #[link_name = "llvm.pow.v2f64"] + fn powf_v2f64(x: f64x2, y: f64x2) -> f64x2; + #[link_name = "llvm.pow.v4f64"] + fn powf_v4f64(x: f64x4, y: f64x4) -> f64x4; + #[link_name = "llvm.pow.v8f64"] + fn powf_v8f64(x: f64x8, y: f64x8) -> f64x8; + + #[link_name = "llvm.pow.f32"] + fn powf_f32(x: f32, y: f32) -> f32; + #[link_name = "llvm.pow.f64"] + fn powf_f64(x: f64, y: f64) -> f64; +} + +gen_binary_impl_table!(Powf, powf); + +cfg_if! { + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_binary!(f32x2[f32; 2]: powf_f32); + impl_binary!(f32x4[f32; 4]: powf_f32); + impl_binary!(f32x8[f32; 8]: powf_f32); + impl_binary!(f32x16[f32; 16]: powf_f32); + + impl_binary!(f64x2[f64; 2]: powf_f64); + impl_binary!(f64x4[f64; 4]: powf_f64); + impl_binary!(f64x8[f64; 8]: powf_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
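+ // Note on SLEEF naming: the `_u10` suffix marks a maximum error bound of
+ // 1.0 ULP, and the trailing part names the instruction set the entry point
+ // is built for (`sse2`, `sse4`, `avx`, `avx2`, or `avx2128` for 128-bit
+ // operations encoded with AVX2).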
{ + if #[cfg(target_feature = "avx2")] { + impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10avx2128); + impl_binary!(f32x16[h => f32x8]: Sleef_powf8_u10avx2); + impl_binary!(f64x8[h => f64x4]: Sleef_powd4_u10avx2); + + impl_binary!(f32x4: Sleef_powf4_u10avx2128); + impl_binary!(f32x8: Sleef_powf8_u10avx2); + impl_binary!(f64x2: Sleef_powd2_u10avx2128); + impl_binary!(f64x4: Sleef_powd4_u10avx2); + } else if #[cfg(target_feature = "avx")] { + impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse4); + impl_binary!(f32x16[h => f32x8]: Sleef_powf8_u10avx); + impl_binary!(f64x8[h => f64x4]: Sleef_powd4_u10avx); + + impl_binary!(f32x4: Sleef_powf4_u10sse4); + impl_binary!(f32x8: Sleef_powf8_u10avx); + impl_binary!(f64x2: Sleef_powd2_u10sse4); + impl_binary!(f64x4: Sleef_powd4_u10avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse4); + impl_binary!(f32x16[q => f32x4]: Sleef_powf4_u10sse4); + impl_binary!(f64x8[q => f64x2]: Sleef_powd2_u10sse4); + + impl_binary!(f32x4: Sleef_powf4_u10sse4); + impl_binary!(f32x8[h => f32x4]: Sleef_powf4_u10sse4); + impl_binary!(f64x2: Sleef_powd2_u10sse4); + impl_binary!(f64x4[h => f64x2]: Sleef_powd2_u10sse4); + } else if #[cfg(target_feature = "sse2")] { + impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse2); + impl_binary!(f32x16[q => f32x4]: Sleef_powf4_u10sse2); + impl_binary!(f64x8[q => f64x2]: Sleef_powd2_u10sse2); + + impl_binary!(f32x4: Sleef_powf4_u10sse2); + impl_binary!(f32x8[h => f32x4]: Sleef_powf4_u10sse2); + impl_binary!(f64x2: Sleef_powd2_u10sse2); + impl_binary!(f64x4[h => f64x2]: Sleef_powd2_u10sse2); + } else { + impl_binary!(f32x2[f32; 2]: powf_f32); + impl_binary!(f32x4: powf_v4f32); + impl_binary!(f32x8: powf_v8f32); + impl_binary!(f32x16: powf_v16f32); + + impl_binary!(f64x2: powf_v2f64); + impl_binary!(f64x4: powf_v4f64); + impl_binary!(f64x8: powf_v8f64); + } + } + } else { + impl_binary!(f32x2[f32; 2]: powf_f32); + impl_binary!(f32x4: powf_v4f32); + impl_binary!(f32x8: powf_v8f32); + impl_binary!(f32x16: powf_v16f32); + + impl_binary!(f64x2: powf_v2f64); + impl_binary!(f64x4: powf_v4f64); + impl_binary!(f64x8: powf_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/sin.rs b/vendor/packed_simd_2/src/codegen/math/float/sin.rs new file mode 100644 index 000000000..7b014d07d --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/sin.rs @@ -0,0 +1,103 @@ +//! Vertical floating-point `sin` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors sin + +use crate::*; + +crate trait Sin { + fn sin(self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.sin.v2f32"] + fn sin_v2f32(x: f32x2) -> f32x2; + #[link_name = "llvm.sin.v4f32"] + fn sin_v4f32(x: f32x4) -> f32x4; + #[link_name = "llvm.sin.v8f32"] + fn sin_v8f32(x: f32x8) -> f32x8; + #[link_name = "llvm.sin.v16f32"] + fn sin_v16f32(x: f32x16) -> f32x16; + /* FIXME 64-bit single elem vectors + #[link_name = "llvm.sin.v1f64"] + fn sin_v1f64(x: f64x1) -> f64x1; + */ + #[link_name = "llvm.sin.v2f64"] + fn sin_v2f64(x: f64x2) -> f64x2; + #[link_name = "llvm.sin.v4f64"] + fn sin_v4f64(x: f64x4) -> f64x4; + #[link_name = "llvm.sin.v8f64"] + fn sin_v8f64(x: f64x8) -> f64x8; + + #[link_name = "llvm.sin.f32"] + fn sin_f32(x: f32) -> f32; + #[link_name = "llvm.sin.f64"] + fn sin_f64(x: f64) -> f64; +} + +gen_unary_impl_table!(Sin, sin); + +cfg_if! 
{ + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: sin_f32); + impl_unary!(f32x4[f32; 4]: sin_f32); + impl_unary!(f32x8[f32; 8]: sin_f32); + impl_unary!(f32x16[f32; 16]: sin_f32); + + impl_unary!(f64x2[f64; 2]: sin_f64); + impl_unary!(f64x4[f64; 4]: sin_f64); + impl_unary!(f64x8[f64; 8]: sin_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! { + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_sinf8_u10avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_sind4_u10avx2); + + impl_unary!(f32x4: Sleef_sinf4_u10avx2128); + impl_unary!(f32x8: Sleef_sinf8_u10avx2); + impl_unary!(f64x2: Sleef_sind2_u10avx2128); + impl_unary!(f64x4: Sleef_sind4_u10avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_sinf8_u10avx); + impl_unary!(f64x8[h => f64x4]: Sleef_sind4_u10avx); + + impl_unary!(f32x4: Sleef_sinf4_u10sse4); + impl_unary!(f32x8: Sleef_sinf8_u10avx); + impl_unary!(f64x2: Sleef_sind2_u10sse4); + impl_unary!(f64x4: Sleef_sind4_u10avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_sinf4_u10sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_sind2_u10sse4); + + impl_unary!(f32x4: Sleef_sinf4_u10sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_sinf4_u10sse4); + impl_unary!(f64x2: Sleef_sind2_u10sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_sind2_u10sse4); + } else { + impl_unary!(f32x2[f32; 2]: sin_f32); + impl_unary!(f32x16: sin_v16f32); + impl_unary!(f64x8: sin_v8f64); + + impl_unary!(f32x4: sin_v4f32); + impl_unary!(f32x8: sin_v8f32); + impl_unary!(f64x2: sin_v2f64); + impl_unary!(f64x4: sin_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: sin_f32); + impl_unary!(f32x4: sin_v4f32); + impl_unary!(f32x8: sin_v8f32); + impl_unary!(f32x16: sin_v16f32); + + impl_unary!(f64x2: sin_v2f64); + impl_unary!(f64x4: sin_v4f64); + impl_unary!(f64x8: sin_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/sin_cos_pi.rs b/vendor/packed_simd_2/src/codegen/math/float/sin_cos_pi.rs new file mode 100644 index 000000000..0f1249ec8 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/sin_cos_pi.rs @@ -0,0 +1,195 @@ +//! Vertical floating-point `sin_cos` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors sin_cos + +use crate::*; + +crate trait SinCosPi: Sized { + type Output; + fn sin_cos_pi(self) -> Self::Output; +} + +macro_rules! impl_def { + ($vid:ident, $PI:path) => { + impl SinCosPi for $vid { + type Output = (Self, Self); + #[inline] + fn sin_cos_pi(self) -> Self::Output { + let v = self * Self::splat($PI); + (v.sin(), v.cos()) + } + } + }; +} + +macro_rules! impl_def32 { + ($vid:ident) => { + impl_def!($vid, crate::f32::consts::PI); + }; +} +macro_rules! impl_def64 { + ($vid:ident) => { + impl_def!($vid, crate::f64::consts::PI); + }; +} + +macro_rules! 
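+// Note: `impl_unary_t` adapts SLEEF's `sincospi` entry points, which return a
+// (sin, cos) pair, to vectors of a different width: the `[t => ...]` arm pads
+// the input into a wider call and keeps the low lanes of both results, while
+// the `[h => ...]` and `[q => ...]` arms run the call per half or quarter and
+// stitch the partial results back together through unions.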
impl_unary_t { + ($vid:ident: $fun:ident) => { + impl SinCosPi for $vid { + type Output = (Self, Self); + fn sin_cos_pi(self) -> Self::Output { + unsafe { + use crate::mem::transmute; + transmute($fun(transmute(self))) + } + } + } + }; + ($vid:ident[t => $vid_t:ident]: $fun:ident) => { + impl SinCosPi for $vid { + type Output = (Self, Self); + fn sin_cos_pi(self) -> Self::Output { + unsafe { + use crate::mem::{transmute, uninitialized}; + + union U { + vec: [$vid; 2], + twice: $vid_t, + } + + let twice = U { vec: [self, uninitialized()] }.twice; + let twice = transmute($fun(transmute(twice))); + + union R { + twice: ($vid_t, $vid_t), + vecs: ([$vid; 2], [$vid; 2]), + } + let r = R { twice }.vecs; + (*r.0.get_unchecked(0), *r.0.get_unchecked(1)) + } + } + } + }; + ($vid:ident[h => $vid_h:ident]: $fun:ident) => { + impl SinCosPi for $vid { + type Output = (Self, Self); + fn sin_cos_pi(self) -> Self::Output { + unsafe { + use crate::mem::transmute; + + union U { + vec: $vid, + halves: [$vid_h; 2], + } + + let halves = U { vec: self }.halves; + + let res_0: ($vid_h, $vid_h) = + transmute($fun(transmute(*halves.get_unchecked(0)))); + let res_1: ($vid_h, $vid_h) = + transmute($fun(transmute(*halves.get_unchecked(1)))); + + union R { + result: ($vid, $vid), + halves: ([$vid_h; 2], [$vid_h; 2]), + } + R { halves: ([res_0.0, res_1.0], [res_0.1, res_1.1]) } + .result + } + } + } + }; + ($vid:ident[q => $vid_q:ident]: $fun:ident) => { + impl SinCosPi for $vid { + type Output = (Self, Self); + fn sin_cos_pi(self) -> Self::Output { + unsafe { + use crate::mem::transmute; + + union U { + vec: $vid, + quarters: [$vid_q; 4], + } + + let quarters = U { vec: self }.quarters; + + let res_0: ($vid_q, $vid_q) = + transmute($fun(transmute(*quarters.get_unchecked(0)))); + let res_1: ($vid_q, $vid_q) = + transmute($fun(transmute(*quarters.get_unchecked(1)))); + let res_2: ($vid_q, $vid_q) = + transmute($fun(transmute(*quarters.get_unchecked(2)))); + let res_3: ($vid_q, $vid_q) = + transmute($fun(transmute(*quarters.get_unchecked(3)))); + + union R { + result: ($vid, $vid), + quarters: ([$vid_q; 4], [$vid_q; 4]), + } + R { + quarters: ( + [res_0.0, res_1.0, res_2.0, res_3.0], + [res_0.1, res_1.1, res_2.1, res_3.1], + ), + } + .result + } + } + } + }; +} + +cfg_if! { + if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05avx2128); + impl_unary_t!(f32x16[h => f32x8]: Sleef_sincospif8_u05avx2); + impl_unary_t!(f64x8[h => f64x4]: Sleef_sincospid4_u05avx2); + + impl_unary_t!(f32x4: Sleef_sincospif4_u05avx2128); + impl_unary_t!(f32x8: Sleef_sincospif8_u05avx2); + impl_unary_t!(f64x2: Sleef_sincospid2_u05avx2128); + impl_unary_t!(f64x4: Sleef_sincospid4_u05avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05sse4); + impl_unary_t!(f32x16[h => f32x8]: Sleef_sincospif8_u05avx); + impl_unary_t!(f64x8[h => f64x4]: Sleef_sincospid4_u05avx); + + impl_unary_t!(f32x4: Sleef_sincospif4_u05sse4); + impl_unary_t!(f32x8: Sleef_sincospif8_u05avx); + impl_unary_t!(f64x2: Sleef_sincospid2_u05sse4); + impl_unary_t!(f64x4: Sleef_sincospid4_u05avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05sse4); + impl_unary_t!(f32x16[q => f32x4]: Sleef_sincospif4_u05sse4); + impl_unary_t!(f64x8[q => f64x2]: Sleef_sincospid2_u05sse4); + + impl_unary_t!(f32x4: Sleef_sincospif4_u05sse4); + impl_unary_t!(f32x8[h => f32x4]: Sleef_sincospif4_u05sse4); + impl_unary_t!(f64x2: Sleef_sincospid2_u05sse4); + impl_unary_t!(f64x4[h => f64x2]: Sleef_sincospid2_u05sse4); + } else { + impl_def32!(f32x2); + impl_def32!(f32x4); + impl_def32!(f32x8); + impl_def32!(f32x16); + + impl_def64!(f64x2); + impl_def64!(f64x4); + impl_def64!(f64x8); + } + } + } else { + impl_def32!(f32x2); + impl_def32!(f32x4); + impl_def32!(f32x8); + impl_def32!(f32x16); + + impl_def64!(f64x2); + impl_def64!(f64x4); + impl_def64!(f64x8); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/sin_pi.rs b/vendor/packed_simd_2/src/codegen/math/float/sin_pi.rs new file mode 100644 index 000000000..72df98c93 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/sin_pi.rs @@ -0,0 +1,87 @@ +//! Vertical floating-point `sin_pi` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors sin_pi + +use crate::*; + +crate trait SinPi { + fn sin_pi(self) -> Self; +} + +gen_unary_impl_table!(SinPi, sin_pi); + +macro_rules! impl_def { + ($vid:ident, $PI:path) => { + impl SinPi for $vid { + #[inline] + fn sin_pi(self) -> Self { + (self * Self::splat($PI)).sin() + } + } + }; +} +macro_rules! impl_def32 { + ($vid:ident) => { + impl_def!($vid, crate::f32::consts::PI); + }; +} +macro_rules! impl_def64 { + ($vid:ident) => { + impl_def!($vid, crate::f64::consts::PI); + }; +} + +cfg_if! { + if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_sinpif8_u05avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_sinpid4_u05avx2); + + impl_unary!(f32x4: Sleef_sinpif4_u05avx2128); + impl_unary!(f32x8: Sleef_sinpif8_u05avx2); + impl_unary!(f64x2: Sleef_sinpid2_u05avx2128); + impl_unary!(f64x4: Sleef_sinpid4_u05avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_sinpif8_u05avx); + impl_unary!(f64x8[h => f64x4]: Sleef_sinpid4_u05avx); + + impl_unary!(f32x4: Sleef_sinpif4_u05sse4); + impl_unary!(f32x8: Sleef_sinpif8_u05avx); + impl_unary!(f64x2: Sleef_sinpid2_u05sse4); + impl_unary!(f64x4: Sleef_sinpid4_u05avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_sinpif4_u05sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_sinpid2_u05sse4); + + impl_unary!(f32x4: Sleef_sinpif4_u05sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_sinpif4_u05sse4); + impl_unary!(f64x2: Sleef_sinpid2_u05sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_sinpid2_u05sse4); + } else { + impl_def32!(f32x2); + impl_def32!(f32x4); + impl_def32!(f32x8); + impl_def32!(f32x16); + + impl_def64!(f64x2); + impl_def64!(f64x4); + impl_def64!(f64x8); + } + } + } else { + impl_def32!(f32x2); + impl_def32!(f32x4); + impl_def32!(f32x8); + impl_def32!(f32x16); + + impl_def64!(f64x2); + impl_def64!(f64x4); + impl_def64!(f64x8); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/sqrt.rs b/vendor/packed_simd_2/src/codegen/math/float/sqrt.rs new file mode 100644 index 000000000..7ce31df62 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/sqrt.rs @@ -0,0 +1,103 @@ +//! Vertical floating-point `sqrt` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors sqrt + +use crate::*; + +crate trait Sqrt { + fn sqrt(self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.sqrt.v2f32"] + fn sqrt_v2f32(x: f32x2) -> f32x2; + #[link_name = "llvm.sqrt.v4f32"] + fn sqrt_v4f32(x: f32x4) -> f32x4; + #[link_name = "llvm.sqrt.v8f32"] + fn sqrt_v8f32(x: f32x8) -> f32x8; + #[link_name = "llvm.sqrt.v16f32"] + fn sqrt_v16f32(x: f32x16) -> f32x16; + /* FIXME 64-bit sqrtgle elem vectors + #[link_name = "llvm.sqrt.v1f64"] + fn sqrt_v1f64(x: f64x1) -> f64x1; + */ + #[link_name = "llvm.sqrt.v2f64"] + fn sqrt_v2f64(x: f64x2) -> f64x2; + #[link_name = "llvm.sqrt.v4f64"] + fn sqrt_v4f64(x: f64x4) -> f64x4; + #[link_name = "llvm.sqrt.v8f64"] + fn sqrt_v8f64(x: f64x8) -> f64x8; + + #[link_name = "llvm.sqrt.f32"] + fn sqrt_f32(x: f32) -> f32; + #[link_name = "llvm.sqrt.f64"] + fn sqrt_f64(x: f64) -> f64; +} + +gen_unary_impl_table!(Sqrt, sqrt); + +cfg_if! { + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: sqrt_f32); + impl_unary!(f32x4[f32; 4]: sqrt_f32); + impl_unary!(f32x8[f32; 8]: sqrt_f32); + impl_unary!(f32x16[f32; 16]: sqrt_f32); + + impl_unary!(f64x2[f64; 2]: sqrt_f64); + impl_unary!(f64x4[f64; 4]: sqrt_f64); + impl_unary!(f64x8[f64; 8]: sqrt_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_avx2); + + impl_unary!(f32x4: Sleef_sqrtf4_avx2128); + impl_unary!(f32x8: Sleef_sqrtf8_avx2); + impl_unary!(f64x2: Sleef_sqrtd2_avx2128); + impl_unary!(f64x4: Sleef_sqrtd4_avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_avx); + impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_avx); + + impl_unary!(f32x4: Sleef_sqrtf4_sse4); + impl_unary!(f32x8: Sleef_sqrtf8_avx); + impl_unary!(f64x2: Sleef_sqrtd2_sse4); + impl_unary!(f64x4: Sleef_sqrtd4_avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_sqrtf4_sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_sqrtd2_sse4); + + impl_unary!(f32x4: Sleef_sqrtf4_sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_sqrtf4_sse4); + impl_unary!(f64x2: Sleef_sqrtd2_sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_sqrtd2_sse4); + } else { + impl_unary!(f32x2[f32; 2]: sqrt_f32); + impl_unary!(f32x16: sqrt_v16f32); + impl_unary!(f64x8: sqrt_v8f64); + + impl_unary!(f32x4: sqrt_v4f32); + impl_unary!(f32x8: sqrt_v8f32); + impl_unary!(f64x2: sqrt_v2f64); + impl_unary!(f64x4: sqrt_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: sqrt_f32); + impl_unary!(f32x4: sqrt_v4f32); + impl_unary!(f32x8: sqrt_v8f32); + impl_unary!(f32x16: sqrt_v16f32); + + impl_unary!(f64x2: sqrt_v2f64); + impl_unary!(f64x4: sqrt_v4f64); + impl_unary!(f64x8: sqrt_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/sqrte.rs b/vendor/packed_simd_2/src/codegen/math/float/sqrte.rs new file mode 100644 index 000000000..c1e379c34 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/sqrte.rs @@ -0,0 +1,67 @@ +//! Vertical floating-point `sqrt` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors sqrte + +use crate::llvm::simd_fsqrt; +use crate::*; + +crate trait Sqrte { + fn sqrte(self) -> Self; +} + +gen_unary_impl_table!(Sqrte, sqrte); + +cfg_if! { + if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_u35avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_u35avx2); + + impl_unary!(f32x4: Sleef_sqrtf4_u35avx2128); + impl_unary!(f32x8: Sleef_sqrtf8_u35avx2); + impl_unary!(f64x2: Sleef_sqrtd2_u35avx2128); + impl_unary!(f64x4: Sleef_sqrtd4_u35avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_u35avx); + impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_u35avx); + + impl_unary!(f32x4: Sleef_sqrtf4_u35sse4); + impl_unary!(f32x8: Sleef_sqrtf8_u35avx); + impl_unary!(f64x2: Sleef_sqrtd2_u35sse4); + impl_unary!(f64x4: Sleef_sqrtd4_u35avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_sqrtf4_u35sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_sqrtd2_u35sse4); + + impl_unary!(f32x4: Sleef_sqrtf4_u35sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_sqrtf4_u35sse4); + impl_unary!(f64x2: Sleef_sqrtd2_u35sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_sqrtd2_u35sse4); + } else { + impl_unary!(f32x2[g]: simd_fsqrt); + impl_unary!(f32x16[g]: simd_fsqrt); + impl_unary!(f64x8[g]: simd_fsqrt); + + impl_unary!(f32x4[g]: simd_fsqrt); + impl_unary!(f32x8[g]: simd_fsqrt); + impl_unary!(f64x2[g]: simd_fsqrt); + impl_unary!(f64x4[g]: simd_fsqrt); + } + } + } else { + impl_unary!(f32x2[g]: simd_fsqrt); + impl_unary!(f32x4[g]: simd_fsqrt); + impl_unary!(f32x8[g]: simd_fsqrt); + impl_unary!(f32x16[g]: simd_fsqrt); + + impl_unary!(f64x2[g]: simd_fsqrt); + impl_unary!(f64x4[g]: simd_fsqrt); + impl_unary!(f64x8[g]: simd_fsqrt); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/tanh.rs b/vendor/packed_simd_2/src/codegen/math/float/tanh.rs new file mode 100644 index 000000000..5220c7d10 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/tanh.rs @@ -0,0 +1,117 @@ +//! Vertical floating-point `tanh` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors tanh + +use crate::*; + +crate trait Tanh { + fn tanh(self) -> Self; +} + +macro_rules! define_tanh { + + ($name:ident, $basetype:ty, $simdtype:ty, $lanes:expr, $trait:path) => { + fn $name(x: $simdtype) -> $simdtype { + use core::intrinsics::transmute; + let mut buf: [$basetype; $lanes] = unsafe { transmute(x) }; + for elem in &mut buf { + *elem = <$basetype as $trait>::tanh(*elem); + } + unsafe { transmute(buf) } + } + }; + + (f32 => $name:ident, $type:ty, $lanes:expr) => { + define_tanh!($name, f32, $type, $lanes, libm::F32Ext); + }; + + (f64 => $name:ident, $type:ty, $lanes:expr) => { + define_tanh!($name, f64, $type, $lanes, libm::F64Ext); + }; +} + +// llvm does not seem to expose the hyperbolic versions of trigonometric functions; +// we thus call the classical rust versions on all of them (which stem from cmath). +define_tanh!(f32 => tanh_v2f32, f32x2, 2); +define_tanh!(f32 => tanh_v4f32, f32x4, 4); +define_tanh!(f32 => tanh_v8f32, f32x8, 8); +define_tanh!(f32 => tanh_v16f32, f32x16, 16); + +define_tanh!(f64 => tanh_v2f64, f64x2, 2); +define_tanh!(f64 => tanh_v4f64, f64x4, 4); +define_tanh!(f64 => tanh_v8f64, f64x8, 8); + +fn tanh_f32(x: f32) -> f32 { + libm::F32Ext::tanh(x) +} + +fn tanh_f64(x: f64) -> f64 { + libm::F64Ext::tanh(x) +} + +gen_unary_impl_table!(Tanh, tanh); + +cfg_if! 
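+// Note: `define_tanh!(f32 => tanh_v2f32, f32x2, 2)` above expands to roughly
+// the following per-lane loop; the transmutes are sound because the vector and
+// the corresponding `[element; lanes]` array have the same size:
+//
+//     fn tanh_v2f32(x: f32x2) -> f32x2 {
+//         let mut buf: [f32; 2] = unsafe { transmute(x) };
+//         for elem in &mut buf { *elem = libm::F32Ext::tanh(*elem); }
+//         unsafe { transmute(buf) }
+//     }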
{ + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: tanh_f32); + impl_unary!(f32x4[f32; 4]: tanh_f32); + impl_unary!(f32x8[f32; 8]: tanh_f32); + impl_unary!(f32x16[f32; 16]: tanh_f32); + + impl_unary!(f64x2[f64; 2]: tanh_f64); + impl_unary!(f64x4[f64; 4]: tanh_f64); + impl_unary!(f64x8[f64; 8]: tanh_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! { + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_tanhf8_u10avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_tanhd4_u10avx2); + + impl_unary!(f32x4: Sleef_tanhf4_u10avx2128); + impl_unary!(f32x8: Sleef_tanhf8_u10avx2); + impl_unary!(f64x2: Sleef_tanhd2_u10avx2128); + impl_unary!(f64x4: Sleef_tanhd4_u10avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_tanhf8_u10avx); + impl_unary!(f64x8[h => f64x4]: Sleef_tanhd4_u10avx); + + impl_unary!(f32x4: Sleef_tanhf4_u10sse4); + impl_unary!(f32x8: Sleef_tanhf8_u10avx); + impl_unary!(f64x2: Sleef_tanhd2_u10sse4); + impl_unary!(f64x4: Sleef_tanhd4_u10avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_tanhf4_u10sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_tanhd2_u10sse4); + + impl_unary!(f32x4: Sleef_tanhf4_u10sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_tanhf4_u10sse4); + impl_unary!(f64x2: Sleef_tanhd2_u10sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_tanhd2_u10sse4); + } else { + impl_unary!(f32x2[f32; 2]: tanh_f32); + impl_unary!(f32x16: tanh_v16f32); + impl_unary!(f64x8: tanh_v8f64); + + impl_unary!(f32x4: tanh_v4f32); + impl_unary!(f32x8: tanh_v8f32); + impl_unary!(f64x2: tanh_v2f64); + impl_unary!(f64x4: tanh_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: tanh_f32); + impl_unary!(f32x4: tanh_v4f32); + impl_unary!(f32x8: tanh_v8f32); + impl_unary!(f32x16: tanh_v16f32); + + impl_unary!(f64x2: tanh_v2f64); + impl_unary!(f64x4: tanh_v4f64); + impl_unary!(f64x8: tanh_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/pointer_sized_int.rs b/vendor/packed_simd_2/src/codegen/pointer_sized_int.rs new file mode 100644 index 000000000..39f493d3b --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/pointer_sized_int.rs @@ -0,0 +1,28 @@ +//! Provides `isize` and `usize` + +use cfg_if::cfg_if; + +cfg_if! 
{ + if #[cfg(target_pointer_width = "8")] { + crate type isize_ = i8; + crate type usize_ = u8; + } else if #[cfg(target_pointer_width = "16")] { + crate type isize_ = i16; + crate type usize_ = u16; + } else if #[cfg(target_pointer_width = "32")] { + crate type isize_ = i32; + crate type usize_ = u32; + + } else if #[cfg(target_pointer_width = "64")] { + crate type isize_ = i64; + crate type usize_ = u64; + } else if #[cfg(target_pointer_width = "64")] { + crate type isize_ = i64; + crate type usize_ = u64; + } else if #[cfg(target_pointer_width = "128")] { + crate type isize_ = i128; + crate type usize_ = u128; + } else { + compile_error!("unsupported target_pointer_width"); + } +} diff --git a/vendor/packed_simd_2/src/codegen/reductions.rs b/vendor/packed_simd_2/src/codegen/reductions.rs new file mode 100644 index 000000000..7be4f5fab --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions.rs @@ -0,0 +1 @@ +crate mod mask; diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask.rs b/vendor/packed_simd_2/src/codegen/reductions/mask.rs new file mode 100644 index 000000000..97260c6d4 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask.rs @@ -0,0 +1,69 @@ +//! Code generation workaround for `all()` mask horizontal reduction. +//! +//! Works arround [LLVM bug 36702]. +//! +//! [LLVM bug 36702]: https://bugs.llvm.org/show_bug.cgi?id=36702 +#![allow(unused_macros)] + +use crate::*; + +crate trait All: crate::marker::Sized { + unsafe fn all(self) -> bool; +} + +crate trait Any: crate::marker::Sized { + unsafe fn any(self) -> bool; +} + +#[macro_use] +mod fallback_impl; + +cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + #[macro_use] + mod x86; + } else if #[cfg(all(target_arch = "arm", target_feature = "v7", + target_feature = "neon", + any(feature = "core_arch", libcore_neon)))] { + #[macro_use] + mod arm; + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + #[macro_use] + mod aarch64; + } else { + #[macro_use] + mod fallback; + } +} + +impl_mask_reductions!(m8x2); +impl_mask_reductions!(m8x4); +impl_mask_reductions!(m8x8); +impl_mask_reductions!(m8x16); +impl_mask_reductions!(m8x32); +impl_mask_reductions!(m8x64); + +impl_mask_reductions!(m16x2); +impl_mask_reductions!(m16x4); +impl_mask_reductions!(m16x8); +impl_mask_reductions!(m16x16); +impl_mask_reductions!(m16x32); + +impl_mask_reductions!(m32x2); +impl_mask_reductions!(m32x4); +impl_mask_reductions!(m32x8); +impl_mask_reductions!(m32x16); + +// FIXME: 64-bit single element vector +// impl_mask_reductions!(m64x1); +impl_mask_reductions!(m64x2); +impl_mask_reductions!(m64x4); +impl_mask_reductions!(m64x8); + +impl_mask_reductions!(m128x1); +impl_mask_reductions!(m128x2); +impl_mask_reductions!(m128x4); + +impl_mask_reductions!(msizex2); +impl_mask_reductions!(msizex4); +impl_mask_reductions!(msizex8); diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/aarch64.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/aarch64.rs new file mode 100644 index 000000000..e9586eace --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/aarch64.rs @@ -0,0 +1,71 @@ +//! Mask reductions implementation for `aarch64` targets + +/// 128-bit wide vectors +macro_rules! 
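+// Note: every mask lane is either all-zeros or all-ones, so a horizontal
+// unsigned minimum (`vminvq_*`) is non-zero exactly when all lanes are set,
+// and a horizontal maximum (`vmaxvq_*`) is non-zero exactly when at least one
+// lane is set; that is what makes the reductions below correct.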
aarch64_128_neon_impl { + ($id:ident, $vmin:ident, $vmax:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn all(self) -> bool { + use crate::arch::aarch64::$vmin; + $vmin(crate::mem::transmute(self)) != 0 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn any(self) -> bool { + use crate::arch::aarch64::$vmax; + $vmax(crate::mem::transmute(self)) != 0 + } + } + } +} + +/// 64-bit wide vectors +macro_rules! aarch64_64_neon_impl { + ($id:ident, $vec128:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn all(self) -> bool { + // Duplicates the 64-bit vector into a 128-bit one and + // calls all on that. + union U { + halves: ($id, $id), + vec: $vec128, + } + U { + halves: (self, self), + }.vec.all() + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn any(self) -> bool { + union U { + halves: ($id, $id), + vec: $vec128, + } + U { + halves: (self, self), + }.vec.any() + } + } + }; +} + +/// Mask reduction implementation for `aarch64` targets +macro_rules! impl_mask_reductions { + // 64-bit wide masks + (m8x8) => { aarch64_64_neon_impl!(m8x8, m8x16); }; + (m16x4) => { aarch64_64_neon_impl!(m16x4, m16x8); }; + (m32x2) => { aarch64_64_neon_impl!(m32x2, m32x4); }; + // 128-bit wide masks + (m8x16) => { aarch64_128_neon_impl!(m8x16, vminvq_u8, vmaxvq_u8); }; + (m16x8) => { aarch64_128_neon_impl!(m16x8, vminvq_u16, vmaxvq_u16); }; + (m32x4) => { aarch64_128_neon_impl!(m32x4, vminvq_u32, vmaxvq_u32); }; + // Fallback to LLVM's default code-generation: + ($id:ident) => { fallback_impl!($id); }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/arm.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/arm.rs new file mode 100644 index 000000000..1987af7a9 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/arm.rs @@ -0,0 +1,54 @@ +//! Mask reductions implementation for `arm` targets + +/// Implementation for ARM + v7 + NEON for 64-bit or 128-bit wide vectors with +/// more than two elements. +macro_rules! arm_128_v7_neon_impl { + ($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn all(self) -> bool { + use crate::arch::arm::$vpmin; + use crate::mem::transmute; + union U { + halves: ($half, $half), + vec: $id, + } + let halves = U { vec: self }.halves; + let h: $half = transmute($vpmin( + transmute(halves.0), + transmute(halves.1), + )); + h.all() + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn any(self) -> bool { + use crate::arch::arm::$vpmax; + use crate::mem::transmute; + union U { + halves: ($half, $half), + vec: $id, + } + let halves = U { vec: self }.halves; + let h: $half = transmute($vpmax( + transmute(halves.0), + transmute(halves.1), + )); + h.any() + } + } + }; +} + +/// Mask reduction implementation for `arm` targets +macro_rules! 
impl_mask_reductions { + // 128-bit wide masks + (m8x16) => { arm_128_v7_neon_impl!(m8x16, m8x8, vpmin_u8, vpmax_u8); }; + (m16x8) => { arm_128_v7_neon_impl!(m16x8, m16x4, vpmin_u16, vpmax_u16); }; + (m32x4) => { arm_128_v7_neon_impl!(m32x4, m32x2, vpmin_u32, vpmax_u32); }; + // Fallback to LLVM's default code-generation: + ($id:ident) => { fallback_impl!($id); }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/fallback.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/fallback.rs new file mode 100644 index 000000000..25e5c813a --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/fallback.rs @@ -0,0 +1,6 @@ +//! Default mask reduction implementations. + +/// Default mask reduction implementation +macro_rules! impl_mask_reductions { + ($id:ident) => { fallback_impl!($id); }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs new file mode 100644 index 000000000..0d246e2fd --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs @@ -0,0 +1,237 @@ +//! Default implementation of a mask reduction for any target. + +macro_rules! fallback_to_other_impl { + ($id:ident, $other:ident) => { + impl All for $id { + #[inline] + unsafe fn all(self) -> bool { + let m: $other = crate::mem::transmute(self); + m.all() + } + } + impl Any for $id { + #[inline] + unsafe fn any(self) -> bool { + let m: $other = crate::mem::transmute(self); + m.any() + } + } + }; +} + +/// Fallback implementation. +macro_rules! fallback_impl { + // 16-bit wide masks: + (m8x2) => { + impl All for m8x2 { + #[inline] + unsafe fn all(self) -> bool { + let i: u16 = crate::mem::transmute(self); + i == u16::max_value() + } + } + impl Any for m8x2 { + #[inline] + unsafe fn any(self) -> bool { + let i: u16 = crate::mem::transmute(self); + i != 0 + } + } + }; + // 32-bit wide masks + (m8x4) => { + impl All for m8x4 { + #[inline] + unsafe fn all(self) -> bool { + let i: u32 = crate::mem::transmute(self); + i == u32::max_value() + } + } + impl Any for m8x4 { + #[inline] + unsafe fn any(self) -> bool { + let i: u32 = crate::mem::transmute(self); + i != 0 + } + } + }; + (m16x2) => { + fallback_to_other_impl!(m16x2, m8x4); + }; + // 64-bit wide masks: + (m8x8) => { + impl All for m8x8 { + #[inline] + unsafe fn all(self) -> bool { + let i: u64 = crate::mem::transmute(self); + i == u64::max_value() + } + } + impl Any for m8x8 { + #[inline] + unsafe fn any(self) -> bool { + let i: u64 = crate::mem::transmute(self); + i != 0 + } + } + }; + (m16x4) => { + fallback_to_other_impl!(m16x4, m8x8); + }; + (m32x2) => { + fallback_to_other_impl!(m32x2, m16x4); + }; + // FIXME: 64x1 maxk + // 128-bit wide masks: + (m8x16) => { + impl All for m8x16 { + #[inline] + unsafe fn all(self) -> bool { + let i: u128 = crate::mem::transmute(self); + i == u128::max_value() + } + } + impl Any for m8x16 { + #[inline] + unsafe fn any(self) -> bool { + let i: u128 = crate::mem::transmute(self); + i != 0 + } + } + }; + (m16x8) => { + fallback_to_other_impl!(m16x8, m8x16); + }; + (m32x4) => { + fallback_to_other_impl!(m32x4, m16x8); + }; + (m64x2) => { + fallback_to_other_impl!(m64x2, m32x4); + }; + (m128x1) => { + fallback_to_other_impl!(m128x1, m64x2); + }; + // 256-bit wide masks + (m8x32) => { + impl All for m8x32 { + #[inline] + unsafe fn all(self) -> bool { + let i: [u128; 2] = crate::mem::transmute(self); + let o: [u128; 2] = [u128::max_value(); 2]; + i == o + } + } + impl Any for m8x32 { + #[inline] + 
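+ // As in `all` above, the 256-bit mask is viewed as `[u128; 2]`: it is
+ // all-true exactly when its bit pattern is all ones, and "any" holds
+ // exactly when the pattern is non-zero, so a plain array comparison
+ // suffices.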
unsafe fn any(self) -> bool { + let i: [u128; 2] = crate::mem::transmute(self); + let o: [u128; 2] = [0; 2]; + i != o + } + } + }; + (m16x16) => { + fallback_to_other_impl!(m16x16, m8x32); + }; + (m32x8) => { + fallback_to_other_impl!(m32x8, m16x16); + }; + (m64x4) => { + fallback_to_other_impl!(m64x4, m32x8); + }; + (m128x2) => { + fallback_to_other_impl!(m128x2, m64x4); + }; + // 512-bit wide masks + (m8x64) => { + impl All for m8x64 { + #[inline] + unsafe fn all(self) -> bool { + let i: [u128; 4] = crate::mem::transmute(self); + let o: [u128; 4] = [u128::max_value(); 4]; + i == o + } + } + impl Any for m8x64 { + #[inline] + unsafe fn any(self) -> bool { + let i: [u128; 4] = crate::mem::transmute(self); + let o: [u128; 4] = [0; 4]; + i != o + } + } + }; + (m16x32) => { + fallback_to_other_impl!(m16x32, m8x64); + }; + (m32x16) => { + fallback_to_other_impl!(m32x16, m16x32); + }; + (m64x8) => { + fallback_to_other_impl!(m64x8, m32x16); + }; + (m128x4) => { + fallback_to_other_impl!(m128x4, m64x8); + }; + // Masks with pointer-sized elements64 + (msizex2) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex2, m64x2); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex2, m32x2); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + (msizex4) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex4, m64x4); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex4, m32x4); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + (msizex8) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex8, m64x8); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex8, m32x8); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; +} + +macro_rules! recurse_half { + ($vid:ident, $vid_h:ident) => { + impl All for $vid { + #[inline] + unsafe fn all(self) -> bool { + union U { + halves: ($vid_h, $vid_h), + vec: $vid, + } + let halves = U { vec: self }.halves; + halves.0.all() && halves.1.all() + } + } + impl Any for $vid { + #[inline] + unsafe fn any(self) -> bool { + union U { + halves: ($vid_h, $vid_h), + vec: $vid, + } + let halves = U { vec: self }.halves; + halves.0.any() || halves.1.any() + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86.rs new file mode 100644 index 000000000..bcfb1a6e1 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86.rs @@ -0,0 +1,188 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets + +#[cfg(target_feature = "sse")] +#[macro_use] +mod sse; + +#[cfg(target_feature = "sse2")] +#[macro_use] +mod sse2; + +#[cfg(target_feature = "avx")] +#[macro_use] +mod avx; + +#[cfg(target_feature = "avx2")] +#[macro_use] +mod avx2; + +/// x86 64-bit m8x8 implementation +macro_rules! x86_m8x8_impl { + ($id:ident) => { + fallback_impl!($id); + }; +} + +/// x86 128-bit m8x16 implementation +macro_rules! x86_m8x16_impl { + ($id:ident) => { + cfg_if! { + if #[cfg(target_feature = "sse2")] { + x86_m8x16_sse2_impl!($id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 128-bit m32x4 implementation +macro_rules! x86_m32x4_impl { + ($id:ident) => { + cfg_if! 
{ + if #[cfg(target_feature = "sse")] { + x86_m32x4_sse_impl!($id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 128-bit m64x2 implementation +macro_rules! x86_m64x2_impl { + ($id:ident) => { + cfg_if! { + if #[cfg(target_feature = "sse2")] { + x86_m64x2_sse2_impl!($id); + } else if #[cfg(target_feature = "sse")] { + x86_m32x4_sse_impl!($id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 256-bit m8x32 implementation +macro_rules! x86_m8x32_impl { + ($id:ident, $half_id:ident) => { + cfg_if! { + if #[cfg(target_feature = "avx2")] { + x86_m8x32_avx2_impl!($id); + } else if #[cfg(target_feature = "avx")] { + x86_m8x32_avx_impl!($id); + } else if #[cfg(target_feature = "sse2")] { + recurse_half!($id, $half_id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 256-bit m32x8 implementation +macro_rules! x86_m32x8_impl { + ($id:ident, $half_id:ident) => { + cfg_if! { + if #[cfg(target_feature = "avx")] { + x86_m32x8_avx_impl!($id); + } else if #[cfg(target_feature = "sse")] { + recurse_half!($id, $half_id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 256-bit m64x4 implementation +macro_rules! x86_m64x4_impl { + ($id:ident, $half_id:ident) => { + cfg_if! { + if #[cfg(target_feature = "avx")] { + x86_m64x4_avx_impl!($id); + } else if #[cfg(target_feature = "sse")] { + recurse_half!($id, $half_id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// Fallback implementation. +macro_rules! x86_intr_impl { + ($id:ident) => { + impl All for $id { + #[inline] + unsafe fn all(self) -> bool { + use crate::llvm::simd_reduce_all; + simd_reduce_all(self.0) + } + } + impl Any for $id { + #[inline] + unsafe fn any(self) -> bool { + use crate::llvm::simd_reduce_any; + simd_reduce_any(self.0) + } + } + }; +} + +/// Mask reduction implementation for `x86` and `x86_64` targets +macro_rules! impl_mask_reductions { + // 64-bit wide masks + (m8x8) => { x86_m8x8_impl!(m8x8); }; + (m16x4) => { x86_m8x8_impl!(m16x4); }; + (m32x2) => { x86_m8x8_impl!(m32x2); }; + // 128-bit wide masks + (m8x16) => { x86_m8x16_impl!(m8x16); }; + (m16x8) => { x86_m8x16_impl!(m16x8); }; + (m32x4) => { x86_m32x4_impl!(m32x4); }; + (m64x2) => { x86_m64x2_impl!(m64x2); }; + (m128x1) => { x86_intr_impl!(m128x1); }; + // 256-bit wide masks: + (m8x32) => { x86_m8x32_impl!(m8x32, m8x16); }; + (m16x16) => { x86_m8x32_impl!(m16x16, m16x8); }; + (m32x8) => { x86_m32x8_impl!(m32x8, m32x4); }; + (m64x4) => { x86_m64x4_impl!(m64x4, m64x2); }; + (m128x2) => { x86_intr_impl!(m128x2); }; + (msizex2) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex2, m64x2); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex2, m32x2); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + (msizex4) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex4, m64x4); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex4, m32x4); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + (msizex8) => { + cfg_if! 
{ + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex8, m64x8); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex8, m32x8); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + + // Fallback to LLVM's default code-generation: + ($id:ident) => { fallback_impl!($id); }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs new file mode 100644 index 000000000..d18736fb0 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs @@ -0,0 +1,101 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets with `AVX` + +/// `x86`/`x86_64` 256-bit `AVX` implementation +/// FIXME: it might be faster here to do two `_mm_movmask_epi8` +#[cfg(target_feature = "avx")] +macro_rules! x86_m8x32_avx_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "avx")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_testc_si256; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_testc_si256; + _mm256_testc_si256( + crate::mem::transmute(self), + crate::mem::transmute($id::splat(true)), + ) != 0 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "avx")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_testz_si256; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_testz_si256; + _mm256_testz_si256( + crate::mem::transmute(self), + crate::mem::transmute(self), + ) == 0 + } + } + }; +} + +/// `x86`/`x86_64` 256-bit m32x8 `AVX` implementation +macro_rules! x86_m32x8_avx_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_ps; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_ps; + // _mm256_movemask_ps(a) creates a 8bit mask containing the + // most significant bit of each lane of `a`. If all bits are + // set, then all 8 lanes of the mask are true. + _mm256_movemask_ps(crate::mem::transmute(self)) == 0b_1111_1111_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_ps; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_ps; + + _mm256_movemask_ps(crate::mem::transmute(self)) != 0 + } + } + }; +} + +/// `x86`/`x86_64` 256-bit m64x4 `AVX` implementation +macro_rules! x86_m64x4_avx_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_pd; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_pd; + // _mm256_movemask_pd(a) creates a 4bit mask containing the + // most significant bit of each lane of `a`. If all bits are + // set, then all 4 lanes of the mask are true. 
+ _mm256_movemask_pd(crate::mem::transmute(self)) == 0b_1111_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_pd; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_pd; + + _mm256_movemask_pd(crate::mem::transmute(self)) != 0 + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs new file mode 100644 index 000000000..d37d02342 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs @@ -0,0 +1,35 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets with `AVX2`. +#![allow(unused)] + +/// x86/x86_64 256-bit m8x32 AVX2 implementation +macro_rules! x86_m8x32_avx2_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_epi8; + // _mm256_movemask_epi8(a) creates a 32bit mask containing the + // most significant bit of each byte of `a`. If all + // bits are set, then all 32 lanes of the mask are + // true. + _mm256_movemask_epi8(crate::mem::transmute(self)) == -1_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_epi8; + + _mm256_movemask_epi8(crate::mem::transmute(self)) != 0 + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs new file mode 100644 index 000000000..eb1ef7fac --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs @@ -0,0 +1,36 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets with `SSE`. +#![allow(unused)] + +/// `x86`/`x86_64` 128-bit `m32x4` `SSE` implementation +macro_rules! x86_m32x4_sse_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_ps; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_ps; + // _mm_movemask_ps(a) creates a 4bit mask containing the + // most significant bit of each lane of `a`. If all + // bits are set, then all 4 lanes of the mask are + // true. + _mm_movemask_ps(crate::mem::transmute(self)) + == 0b_1111_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_ps; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_ps; + + _mm_movemask_ps(crate::mem::transmute(self)) != 0 + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs new file mode 100644 index 000000000..a99c606f5 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs @@ -0,0 +1,70 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets with `SSE2`. +#![allow(unused)] + +/// `x86`/`x86_64` 128-bit m64x2 `SSE2` implementation +macro_rules! 
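+// Note: both SSE2 implementations below rely on `movemask`, which packs the
+// most significant bit of every lane into the low bits of a scalar; `all`
+// then compares that scalar against the fully-set pattern and `any` against
+// zero.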
x86_m64x2_sse2_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_pd; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_pd; + // _mm_movemask_pd(a) creates a 2bit mask containing the + // most significant bit of each lane of `a`. If all + // bits are set, then all 2 lanes of the mask are + // true. + _mm_movemask_pd(crate::mem::transmute(self)) + == 0b_11_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_pd; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_pd; + + _mm_movemask_pd(crate::mem::transmute(self)) != 0 + } + } + }; +} + +/// `x86`/`x86_64` 128-bit m8x16 `SSE2` implementation +macro_rules! x86_m8x16_sse2_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_epi8; + // _mm_movemask_epi8(a) creates a 16bit mask containing the + // most significant bit of each byte of `a`. If all + // bits are set, then all 16 lanes of the mask are + // true. + _mm_movemask_epi8(crate::mem::transmute(self)) + == i32::from(u16::max_value()) + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_epi8; + + _mm_movemask_epi8(crate::mem::transmute(self)) != 0 + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/shuffle.rs b/vendor/packed_simd_2/src/codegen/shuffle.rs new file mode 100644 index 000000000..d92c9ee22 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/shuffle.rs @@ -0,0 +1,150 @@ +//! Implementations of the `ShuffleResult` trait for the different numbers of +//! lanes and vector element types. + +use crate::masks::*; +use crate::sealed::{Shuffle, Seal}; + +macro_rules! impl_shuffle { + ($array:ty, $base:ty, $out:ty) => { + impl Seal<$array> for $base {} + impl Shuffle<$array> for $base { + type Output = $out; + } + } +} + +impl_shuffle! { [u32; 2], i8, crate::codegen::i8x2 } +impl_shuffle! { [u32; 4], i8, crate::codegen::i8x4 } +impl_shuffle! { [u32; 8], i8, crate::codegen::i8x8 } +impl_shuffle! { [u32; 16], i8, crate::codegen::i8x16 } +impl_shuffle! { [u32; 32], i8, crate::codegen::i8x32 } +impl_shuffle! { [u32; 64], i8, crate::codegen::i8x64 } + +impl_shuffle! { [u32; 2], u8, crate::codegen::u8x2 } +impl_shuffle! { [u32; 4], u8, crate::codegen::u8x4 } +impl_shuffle! { [u32; 8], u8, crate::codegen::u8x8 } +impl_shuffle! { [u32; 16], u8, crate::codegen::u8x16 } +impl_shuffle! { [u32; 32], u8, crate::codegen::u8x32 } +impl_shuffle! { [u32; 64], u8, crate::codegen::u8x64 } + +impl_shuffle! { [u32; 2], m8, crate::codegen::m8x2 } +impl_shuffle! { [u32; 4], m8, crate::codegen::m8x4 } +impl_shuffle! { [u32; 8], m8, crate::codegen::m8x8 } +impl_shuffle! { [u32; 16], m8, crate::codegen::m8x16 } +impl_shuffle! { [u32; 32], m8, crate::codegen::m8x32 } +impl_shuffle! { [u32; 64], m8, crate::codegen::m8x64 } + +impl_shuffle! { [u32; 2], i16, crate::codegen::i16x2 } +impl_shuffle! { [u32; 4], i16, crate::codegen::i16x4 } +impl_shuffle! 
{ [u32; 8], i16, crate::codegen::i16x8 } +impl_shuffle! { [u32; 16], i16, crate::codegen::i16x16 } +impl_shuffle! { [u32; 32], i16, crate::codegen::i16x32 } + +impl_shuffle! { [u32; 2], u16, crate::codegen::u16x2 } +impl_shuffle! { [u32; 4], u16, crate::codegen::u16x4 } +impl_shuffle! { [u32; 8], u16, crate::codegen::u16x8 } +impl_shuffle! { [u32; 16], u16, crate::codegen::u16x16 } +impl_shuffle! { [u32; 32], u16, crate::codegen::u16x32 } + +impl_shuffle! { [u32; 2], m16, crate::codegen::m16x2 } +impl_shuffle! { [u32; 4], m16, crate::codegen::m16x4 } +impl_shuffle! { [u32; 8], m16, crate::codegen::m16x8 } +impl_shuffle! { [u32; 16], m16, crate::codegen::m16x16 } + +impl_shuffle! { [u32; 2], i32, crate::codegen::i32x2 } +impl_shuffle! { [u32; 4], i32, crate::codegen::i32x4 } +impl_shuffle! { [u32; 8], i32, crate::codegen::i32x8 } +impl_shuffle! { [u32; 16], i32, crate::codegen::i32x16 } + +impl_shuffle! { [u32; 2], u32, crate::codegen::u32x2 } +impl_shuffle! { [u32; 4], u32, crate::codegen::u32x4 } +impl_shuffle! { [u32; 8], u32, crate::codegen::u32x8 } +impl_shuffle! { [u32; 16], u32, crate::codegen::u32x16 } + +impl_shuffle! { [u32; 2], f32, crate::codegen::f32x2 } +impl_shuffle! { [u32; 4], f32, crate::codegen::f32x4 } +impl_shuffle! { [u32; 8], f32, crate::codegen::f32x8 } +impl_shuffle! { [u32; 16], f32, crate::codegen::f32x16 } + +impl_shuffle! { [u32; 2], m32, crate::codegen::m32x2 } +impl_shuffle! { [u32; 4], m32, crate::codegen::m32x4 } +impl_shuffle! { [u32; 8], m32, crate::codegen::m32x8 } +impl_shuffle! { [u32; 16], m32, crate::codegen::m32x16 } + +/* FIXME: 64-bit single element vector +impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } +*/ +impl_shuffle! { [u32; 2], i64, crate::codegen::i64x2 } +impl_shuffle! { [u32; 4], i64, crate::codegen::i64x4 } +impl_shuffle! { [u32; 8], i64, crate::codegen::i64x8 } + +/* FIXME: 64-bit single element vector +impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } +*/ +impl_shuffle! { [u32; 2], u64, crate::codegen::u64x2 } +impl_shuffle! { [u32; 4], u64, crate::codegen::u64x4 } +impl_shuffle! { [u32; 8], u64, crate::codegen::u64x8 } + +/* FIXME: 64-bit single element vector +impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } +*/ +impl_shuffle! { [u32; 2], f64, crate::codegen::f64x2 } +impl_shuffle! { [u32; 4], f64, crate::codegen::f64x4 } +impl_shuffle! { [u32; 8], f64, crate::codegen::f64x8 } + +/* FIXME: 64-bit single element vector +impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } +*/ +impl_shuffle! { [u32; 2], m64, crate::codegen::m64x2 } +impl_shuffle! { [u32; 4], m64, crate::codegen::m64x4 } +impl_shuffle! { [u32; 8], m64, crate::codegen::m64x8 } + +impl_shuffle! { [u32; 2], isize, crate::codegen::isizex2 } +impl_shuffle! { [u32; 4], isize, crate::codegen::isizex4 } +impl_shuffle! { [u32; 8], isize, crate::codegen::isizex8 } + +impl_shuffle! { [u32; 2], usize, crate::codegen::usizex2 } +impl_shuffle! { [u32; 4], usize, crate::codegen::usizex4 } +impl_shuffle! { [u32; 8], usize, crate::codegen::usizex8 } + +impl_shuffle! { [u32; 2], msize, crate::codegen::msizex2 } +impl_shuffle! { [u32; 4], msize, crate::codegen::msizex4 } +impl_shuffle! 
{ [u32; 8], msize, crate::codegen::msizex8 } + +impl Seal<[u32; 2]> for *const T {} +impl Shuffle<[u32; 2]> for *const T { + type Output = crate::codegen::cptrx2; +} +impl Seal<[u32; 4]> for *const T {} +impl Shuffle<[u32; 4]> for *const T { + type Output = crate::codegen::cptrx4; +} +impl Seal<[u32; 8]> for *const T {} +impl Shuffle<[u32; 8]> for *const T { + type Output = crate::codegen::cptrx8; +} + +impl Seal<[u32; 2]> for *mut T {} +impl Shuffle<[u32; 2]> for *mut T { + type Output = crate::codegen::mptrx2; +} +impl Seal<[u32; 4]> for *mut T {} +impl Shuffle<[u32; 4]> for *mut T { + type Output = crate::codegen::mptrx4; +} +impl Seal<[u32; 8]> for *mut T {} +impl Shuffle<[u32; 8]> for *mut T { + type Output = crate::codegen::mptrx8; +} + +impl_shuffle! { [u32; 1], i128, crate::codegen::i128x1 } +impl_shuffle! { [u32; 2], i128, crate::codegen::i128x2 } +impl_shuffle! { [u32; 4], i128, crate::codegen::i128x4 } + +impl_shuffle! { [u32; 1], u128, crate::codegen::u128x1 } +impl_shuffle! { [u32; 2], u128, crate::codegen::u128x2 } +impl_shuffle! { [u32; 4], u128, crate::codegen::u128x4 } + +impl_shuffle! { [u32; 1], m128, crate::codegen::m128x1 } +impl_shuffle! { [u32; 2], m128, crate::codegen::m128x2 } +impl_shuffle! { [u32; 4], m128, crate::codegen::m128x4 } diff --git a/vendor/packed_simd_2/src/codegen/shuffle1_dyn.rs b/vendor/packed_simd_2/src/codegen/shuffle1_dyn.rs new file mode 100644 index 000000000..8d9577b26 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/shuffle1_dyn.rs @@ -0,0 +1,411 @@ +//! Shuffle vector lanes with run-time indices. + +use crate::*; + +pub trait Shuffle1Dyn { + type Indices; + fn shuffle1_dyn(self, _: Self::Indices) -> Self; +} + +// Fallback implementation +macro_rules! impl_fallback { + ($id:ident) => { + impl Shuffle1Dyn for $id { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + let mut result = Self::splat(0); + for i in 0..$id::lanes() { + result = result + .replace(i, self.extract(indices.extract(i) as usize)); + } + result + } + } + }; +} + +macro_rules! impl_shuffle1_dyn { + (u8x8) => { + cfg_if! { + if #[cfg(all( + any( + all(target_arch = "aarch64", target_feature = "neon"), + all(target_arch = "arm", target_feature = "v7", + target_feature = "neon") + ), + any(feature = "core_arch", libcore_neon) + ) + )] { + impl Shuffle1Dyn for u8x8 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + #[cfg(target_arch = "aarch64")] + use crate::arch::aarch64::vtbl1_u8; + #[cfg(target_arch = "arm")] + use crate::arch::arm::vtbl1_u8; + + // This is safe because the binary is compiled with + // neon enabled at compile-time and can therefore only + // run on CPUs that have it enabled. + unsafe { + Simd(mem::transmute( + vtbl1_u8(mem::transmute(self.0), + crate::mem::transmute(indices.0)) + )) + } + } + } + } else { + impl_fallback!(u8x8); + } + } + }; + (u8x16) => { + cfg_if! { + if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "ssse3"))] { + impl Shuffle1Dyn for u8x16 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_shuffle_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_shuffle_epi8; + // This is safe because the binary is compiled with + // ssse3 enabled at compile-time and can therefore only + // run on CPUs that have it enabled. 
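+ // `_mm_shuffle_epi8` picks, for each output byte, the source byte
+ // indexed by the low four bits of the corresponding control byte
+ // (or zero if the control byte's most significant bit is set),
+ // which matches `shuffle1_dyn` for in-range indices.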
+ unsafe { + Simd(mem::transmute( + _mm_shuffle_epi8(mem::transmute(self.0), + crate::mem::transmute(indices)) + )) + } + } + } + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon", + any(feature = "core_arch", libcore_neon)))] { + impl Shuffle1Dyn for u8x16 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + use crate::arch::aarch64::vqtbl1q_u8; + + // This is safe because the binary is compiled with + // neon enabled at compile-time and can therefore only + // run on CPUs that have it enabled. + unsafe { + Simd(mem::transmute( + vqtbl1q_u8(mem::transmute(self.0), + crate::mem::transmute(indices.0)) + )) + } + } + } + } else if #[cfg(all(target_arch = "arm", target_feature = "v7", + target_feature = "neon", + any(feature = "core_arch", libcore_neon)))] { + impl Shuffle1Dyn for u8x16 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + use crate::arch::arm::vtbl2_u8; + + // This is safe because the binary is compiled with + // neon enabled at compile-time and can therefore only + // run on CPUs that have it enabled. + unsafe { + union U { + j: u8x16, + s: (u8x8, u8x8), + } + + let (i0, i1) = U { j: indices }.s; + + let r0 = vtbl2_u8( + mem::transmute(self), + crate::mem::transmute(i0) + ); + let r1 = vtbl2_u8( + mem::transmute(self), + crate::mem::transmute(i1) + ); + + let r = U { s: (r0, r1) }.j; + + Simd(mem::transmute(r)) + } + } + } + } else { + impl_fallback!(u8x16); + } + } + }; + (u16x8) => { + impl Shuffle1Dyn for u16x8 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + let indices: u8x8 = (indices * 2).cast(); + let indices: u8x16 = shuffle!( + indices, [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7] + ); + let v = u8x16::new( + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 + ); + let indices = indices + v; + unsafe { + let s: u8x16 = crate::mem::transmute(self); + crate::mem::transmute(s.shuffle1_dyn(indices)) + } + } + } + }; + (u32x4) => { + cfg_if! { + if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx"))] { + impl Shuffle1Dyn for u32x4 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + #[cfg(target_arch = "x86")] + use crate::arch::x86::{_mm_permutevar_ps}; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::{_mm_permutevar_ps}; + + unsafe { + crate::mem::transmute( + _mm_permutevar_ps( + crate::mem::transmute(self.0), + crate::mem::transmute(indices.0) + ) + ) + } + } + } + } else { + impl Shuffle1Dyn for u32x4 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + let indices: u8x4 = (indices * 4).cast(); + let indices: u8x16 = shuffle!( + indices, + [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3] + ); + let v = u8x16::new( + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 + ); + let indices = indices + v; + unsafe { + let s: u8x16 = crate::mem::transmute(self); + crate::mem::transmute(s.shuffle1_dyn(indices)) + } + } + } + } + } + }; + (u64x2) => { + cfg_if!
{ + if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx"))] { + impl Shuffle1Dyn for u64x2 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + #[cfg(target_arch = "x86")] + use crate::arch::x86::{_mm_permutevar_pd}; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::{_mm_permutevar_pd}; + // _mm_permutevar_pd uses the _second_ bit of each + // element to perform the selection, that is: 0b00 => 0, + // 0b10 => 1: + let indices = indices << 1; + unsafe { + crate::mem::transmute( + _mm_permutevar_pd( + crate::mem::transmute(self), + crate::mem::transmute(indices) + ) + ) + } + } + } + } else { + impl Shuffle1Dyn for u64x2 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + let indices: u8x2 = (indices * 8).cast(); + let indices: u8x16 = shuffle!( + indices, + [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + ); + let v = u8x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 + ); + let indices = indices + v; + unsafe { + let s: u8x16 =crate::mem::transmute(self); + crate::mem::transmute(s.shuffle1_dyn(indices)) + } + } + } + } + } + }; + (u128x1) => { + impl Shuffle1Dyn for u128x1 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, _indices: Self::Indices) -> Self { + self + } + } + }; + ($id:ident) => { impl_fallback!($id); } +} + +impl_shuffle1_dyn!(u8x2); +impl_shuffle1_dyn!(u8x4); +impl_shuffle1_dyn!(u8x8); +impl_shuffle1_dyn!(u8x16); +impl_shuffle1_dyn!(u8x32); +impl_shuffle1_dyn!(u8x64); + +impl_shuffle1_dyn!(u16x2); +impl_shuffle1_dyn!(u16x4); +impl_shuffle1_dyn!(u16x8); +impl_shuffle1_dyn!(u16x16); +impl_shuffle1_dyn!(u16x32); + +impl_shuffle1_dyn!(u32x2); +impl_shuffle1_dyn!(u32x4); +impl_shuffle1_dyn!(u32x8); +impl_shuffle1_dyn!(u32x16); + +impl_shuffle1_dyn!(u64x2); +impl_shuffle1_dyn!(u64x4); +impl_shuffle1_dyn!(u64x8); + +impl_shuffle1_dyn!(usizex2); +impl_shuffle1_dyn!(usizex4); +impl_shuffle1_dyn!(usizex8); + +impl_shuffle1_dyn!(u128x1); +impl_shuffle1_dyn!(u128x2); +impl_shuffle1_dyn!(u128x4); + +// Implementation for non-unsigned vector types +macro_rules! 
impl_shuffle1_dyn_non_u { + ($id:ident, $uid:ident) => { + impl Shuffle1Dyn for $id { + type Indices = $uid; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + unsafe { + let u: $uid = crate::mem::transmute(self); + crate::mem::transmute(u.shuffle1_dyn(indices)) + } + } + } + }; +} + +impl_shuffle1_dyn_non_u!(i8x2, u8x2); +impl_shuffle1_dyn_non_u!(i8x4, u8x4); +impl_shuffle1_dyn_non_u!(i8x8, u8x8); +impl_shuffle1_dyn_non_u!(i8x16, u8x16); +impl_shuffle1_dyn_non_u!(i8x32, u8x32); +impl_shuffle1_dyn_non_u!(i8x64, u8x64); + +impl_shuffle1_dyn_non_u!(i16x2, u16x2); +impl_shuffle1_dyn_non_u!(i16x4, u16x4); +impl_shuffle1_dyn_non_u!(i16x8, u16x8); +impl_shuffle1_dyn_non_u!(i16x16, u16x16); +impl_shuffle1_dyn_non_u!(i16x32, u16x32); + +impl_shuffle1_dyn_non_u!(i32x2, u32x2); +impl_shuffle1_dyn_non_u!(i32x4, u32x4); +impl_shuffle1_dyn_non_u!(i32x8, u32x8); +impl_shuffle1_dyn_non_u!(i32x16, u32x16); + +impl_shuffle1_dyn_non_u!(i64x2, u64x2); +impl_shuffle1_dyn_non_u!(i64x4, u64x4); +impl_shuffle1_dyn_non_u!(i64x8, u64x8); + +impl_shuffle1_dyn_non_u!(isizex2, usizex2); +impl_shuffle1_dyn_non_u!(isizex4, usizex4); +impl_shuffle1_dyn_non_u!(isizex8, usizex8); + +impl_shuffle1_dyn_non_u!(i128x1, u128x1); +impl_shuffle1_dyn_non_u!(i128x2, u128x2); +impl_shuffle1_dyn_non_u!(i128x4, u128x4); + +impl_shuffle1_dyn_non_u!(m8x2, u8x2); +impl_shuffle1_dyn_non_u!(m8x4, u8x4); +impl_shuffle1_dyn_non_u!(m8x8, u8x8); +impl_shuffle1_dyn_non_u!(m8x16, u8x16); +impl_shuffle1_dyn_non_u!(m8x32, u8x32); +impl_shuffle1_dyn_non_u!(m8x64, u8x64); + +impl_shuffle1_dyn_non_u!(m16x2, u16x2); +impl_shuffle1_dyn_non_u!(m16x4, u16x4); +impl_shuffle1_dyn_non_u!(m16x8, u16x8); +impl_shuffle1_dyn_non_u!(m16x16, u16x16); +impl_shuffle1_dyn_non_u!(m16x32, u16x32); + +impl_shuffle1_dyn_non_u!(m32x2, u32x2); +impl_shuffle1_dyn_non_u!(m32x4, u32x4); +impl_shuffle1_dyn_non_u!(m32x8, u32x8); +impl_shuffle1_dyn_non_u!(m32x16, u32x16); + +impl_shuffle1_dyn_non_u!(m64x2, u64x2); +impl_shuffle1_dyn_non_u!(m64x4, u64x4); +impl_shuffle1_dyn_non_u!(m64x8, u64x8); + +impl_shuffle1_dyn_non_u!(msizex2, usizex2); +impl_shuffle1_dyn_non_u!(msizex4, usizex4); +impl_shuffle1_dyn_non_u!(msizex8, usizex8); + +impl_shuffle1_dyn_non_u!(m128x1, u128x1); +impl_shuffle1_dyn_non_u!(m128x2, u128x2); +impl_shuffle1_dyn_non_u!(m128x4, u128x4); + +impl_shuffle1_dyn_non_u!(f32x2, u32x2); +impl_shuffle1_dyn_non_u!(f32x4, u32x4); +impl_shuffle1_dyn_non_u!(f32x8, u32x8); +impl_shuffle1_dyn_non_u!(f32x16, u32x16); + +impl_shuffle1_dyn_non_u!(f64x2, u64x2); +impl_shuffle1_dyn_non_u!(f64x4, u64x4); +impl_shuffle1_dyn_non_u!(f64x8, u64x8); + +// Implementation for pointer vector types +macro_rules! impl_shuffle1_dyn_ptr { + ($id:ident, $uid:ident) => { + impl Shuffle1Dyn for $id { + type Indices = $uid; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + unsafe { + let u: $uid = crate::mem::transmute(self); + crate::mem::transmute(u.shuffle1_dyn(indices)) + } + } + } + }; +} + +impl_shuffle1_dyn_ptr!(cptrx2, usizex2); +impl_shuffle1_dyn_ptr!(cptrx4, usizex4); +impl_shuffle1_dyn_ptr!(cptrx8, usizex8); + +impl_shuffle1_dyn_ptr!(mptrx2, usizex2); +impl_shuffle1_dyn_ptr!(mptrx4, usizex4); +impl_shuffle1_dyn_ptr!(mptrx8, usizex8); diff --git a/vendor/packed_simd_2/src/codegen/swap_bytes.rs b/vendor/packed_simd_2/src/codegen/swap_bytes.rs new file mode 100644 index 000000000..b435fb5da --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/swap_bytes.rs @@ -0,0 +1,189 @@ +//! Horizontal swap bytes reductions.
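+//!
+//! A minimal illustrative sketch (assuming the vendored crate is imported as
+//! `packed_simd` and re-exports `swap_bytes` on integer vectors): the
+//! `SwapBytes` trait below reverses the bytes of each lane by viewing the
+//! vector as bytes and shuffling them, so it is expected to match the scalar
+//! `u32::swap_bytes` applied lane-wise:
+//!
+//! ```ignore
+//! use packed_simd::u32x4;
+//! let x = u32x4::new(0x0102_0304, 0, 0, 0);
+//! assert_eq!(x.swap_bytes().extract(0), 0x0403_0201);
+//! ```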
+ +// FIXME: investigate using `llvm.bswap` +// https://github.com/rust-lang-nursery/packed_simd/issues/19 + +use crate::*; + +crate trait SwapBytes { + fn swap_bytes(self) -> Self; +} + +macro_rules! impl_swap_bytes { + (v16: $($id:ident,)+) => { + $( + impl SwapBytes for $id { + #[inline] + fn swap_bytes(self) -> Self { + unsafe { shuffle!(self, [1, 0]) } + } + } + )+ + }; + (v32: $($id:ident,)+) => { + $( + impl SwapBytes for $id { + #[inline] + #[allow(clippy::useless_transmute)] + fn swap_bytes(self) -> Self { + unsafe { + let bytes: u8x4 = crate::mem::transmute(self); + let result: u8x4 = shuffle!(bytes, [3, 2, 1, 0]); + crate::mem::transmute(result) + } + } + } + )+ + }; + (v64: $($id:ident,)+) => { + $( + impl SwapBytes for $id { + #[inline] + #[allow(clippy::useless_transmute)] + fn swap_bytes(self) -> Self { + unsafe { + let bytes: u8x8 = crate::mem::transmute(self); + let result: u8x8 = shuffle!( + bytes, [7, 6, 5, 4, 3, 2, 1, 0] + ); + crate::mem::transmute(result) + } + } + } + )+ + }; + (v128: $($id:ident,)+) => { + $( + impl SwapBytes for $id { + #[inline] + #[allow(clippy::useless_transmute)] + fn swap_bytes(self) -> Self { + unsafe { + let bytes: u8x16 = crate::mem::transmute(self); + let result: u8x16 = shuffle!(bytes, [ + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0 + ]); + crate::mem::transmute(result) + } + } + } + )+ + }; + (v256: $($id:ident,)+) => { + $( + impl SwapBytes for $id { + #[inline] + #[allow(clippy::useless_transmute)] + fn swap_bytes(self) -> Self { + unsafe { + let bytes: u8x32 = crate::mem::transmute(self); + let result: u8x32 = shuffle!(bytes, [ + 31, 30, 29, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0 + ]); + crate::mem::transmute(result) + } + } + } + )+ + }; + (v512: $($id:ident,)+) => { + $( + impl SwapBytes for $id { + #[inline] + #[allow(clippy::useless_transmute)] + fn swap_bytes(self) -> Self { + unsafe { + let bytes: u8x64 = crate::mem::transmute(self); + let result: u8x64 = shuffle!(bytes, [ + 63, 62, 61, 60, 59, 58, 57, 56, + 55, 54, 53, 52, 51, 50, 49, 48, + 47, 46, 45, 44, 43, 42, 41, 40, + 39, 38, 37, 36, 35, 34, 33, 32, + 31, 30, 29, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0 + ]); + crate::mem::transmute(result) + } + } + } + )+ + }; +} + +impl_swap_bytes!(v16: u8x2, i8x2,); +impl_swap_bytes!(v32: u8x4, i8x4, u16x2, i16x2,); +// FIXME: 64-bit single element vector +impl_swap_bytes!( + v64: u8x8, + i8x8, + u16x4, + i16x4, + u32x2, + i32x2, /* u64x1, i64x1, */ +); + +impl_swap_bytes!( + v128: u8x16, + i8x16, + u16x8, + i16x8, + u32x4, + i32x4, + u64x2, + i64x2, + u128x1, + i128x1, +); +impl_swap_bytes!( + v256: u8x32, + i8x32, + u16x16, + i16x16, + u32x8, + i32x8, + u64x4, + i64x4, + u128x2, + i128x2, +); + +impl_swap_bytes!( + v512: u8x64, + i8x64, + u16x32, + i16x32, + u32x16, + i32x16, + u64x8, + i64x8, + u128x4, + i128x4, +); + +cfg_if! 
{ + if #[cfg(target_pointer_width = "8")] { + impl_swap_bytes!(v16: isizex2, usizex2,); + impl_swap_bytes!(v32: isizex4, usizex4,); + impl_swap_bytes!(v64: isizex8, usizex8,); + } else if #[cfg(target_pointer_width = "16")] { + impl_swap_bytes!(v32: isizex2, usizex2,); + impl_swap_bytes!(v64: isizex4, usizex4,); + impl_swap_bytes!(v128: isizex8, usizex8,); + } else if #[cfg(target_pointer_width = "32")] { + impl_swap_bytes!(v64: isizex2, usizex2,); + impl_swap_bytes!(v128: isizex4, usizex4,); + impl_swap_bytes!(v256: isizex8, usizex8,); + } else if #[cfg(target_pointer_width = "64")] { + impl_swap_bytes!(v128: isizex2, usizex2,); + impl_swap_bytes!(v256: isizex4, usizex4,); + impl_swap_bytes!(v512: isizex8, usizex8,); + } else { + compile_error!("unsupported target_pointer_width"); + } +} diff --git a/vendor/packed_simd_2/src/codegen/v128.rs b/vendor/packed_simd_2/src/codegen/v128.rs new file mode 100644 index 000000000..9506424fa --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/v128.rs @@ -0,0 +1,46 @@ +//! Internal 128-bit wide vector types + +use crate::masks::*; + +#[rustfmt::skip] +impl_simd_array!( + [i8; 16]: i8x16 | + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8 +); +#[rustfmt::skip] +impl_simd_array!( + [u8; 16]: u8x16 | + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8 +); +#[rustfmt::skip] +impl_simd_array!( + [m8; 16]: m8x16 | + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8 +); + +impl_simd_array!([i16; 8]: i16x8 | i16, i16, i16, i16, i16, i16, i16, i16); +impl_simd_array!([u16; 8]: u16x8 | u16, u16, u16, u16, u16, u16, u16, u16); +impl_simd_array!([m16; 8]: m16x8 | i16, i16, i16, i16, i16, i16, i16, i16); + +impl_simd_array!([i32; 4]: i32x4 | i32, i32, i32, i32); +impl_simd_array!([u32; 4]: u32x4 | u32, u32, u32, u32); +impl_simd_array!([f32; 4]: f32x4 | f32, f32, f32, f32); +impl_simd_array!([m32; 4]: m32x4 | i32, i32, i32, i32); + +impl_simd_array!([i64; 2]: i64x2 | i64, i64); +impl_simd_array!([u64; 2]: u64x2 | u64, u64); +impl_simd_array!([f64; 2]: f64x2 | f64, f64); +impl_simd_array!([m64; 2]: m64x2 | i64, i64); + +impl_simd_array!([i128; 1]: i128x1 | i128); +impl_simd_array!([u128; 1]: u128x1 | u128); +impl_simd_array!([m128; 1]: m128x1 | i128); diff --git a/vendor/packed_simd_2/src/codegen/v16.rs b/vendor/packed_simd_2/src/codegen/v16.rs new file mode 100644 index 000000000..4d55a6d89 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/v16.rs @@ -0,0 +1,7 @@ +//! Internal 16-bit wide vector types + +use crate::masks::*; + +impl_simd_array!([i8; 2]: i8x2 | i8, i8); +impl_simd_array!([u8; 2]: u8x2 | u8, u8); +impl_simd_array!([m8; 2]: m8x2 | i8, i8); diff --git a/vendor/packed_simd_2/src/codegen/v256.rs b/vendor/packed_simd_2/src/codegen/v256.rs new file mode 100644 index 000000000..5ca4759f0 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/v256.rs @@ -0,0 +1,78 @@ +//! 
Internal 256-bit wide vector types + +use crate::masks::*; + +#[rustfmt::skip] +impl_simd_array!( + [i8; 32]: i8x32 | + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8 +); +#[rustfmt::skip] +impl_simd_array!( + [u8; 32]: u8x32 | + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8 +); +#[rustfmt::skip] +impl_simd_array!( + [m8; 32]: m8x32 | + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8 +); +#[rustfmt::skip] +impl_simd_array!( + [i16; 16]: i16x16 | + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16 +); +#[rustfmt::skip] +impl_simd_array!( + [u16; 16]: u16x16 | + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16 +); +#[rustfmt::skip] +impl_simd_array!( + [m16; 16]: m16x16 | + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16 +); + +impl_simd_array!([i32; 8]: i32x8 | i32, i32, i32, i32, i32, i32, i32, i32); +impl_simd_array!([u32; 8]: u32x8 | u32, u32, u32, u32, u32, u32, u32, u32); +impl_simd_array!([f32; 8]: f32x8 | f32, f32, f32, f32, f32, f32, f32, f32); +impl_simd_array!([m32; 8]: m32x8 | i32, i32, i32, i32, i32, i32, i32, i32); + +impl_simd_array!([i64; 4]: i64x4 | i64, i64, i64, i64); +impl_simd_array!([u64; 4]: u64x4 | u64, u64, u64, u64); +impl_simd_array!([f64; 4]: f64x4 | f64, f64, f64, f64); +impl_simd_array!([m64; 4]: m64x4 | i64, i64, i64, i64); + +impl_simd_array!([i128; 2]: i128x2 | i128, i128); +impl_simd_array!([u128; 2]: u128x2 | u128, u128); +impl_simd_array!([m128; 2]: m128x2 | i128, i128); diff --git a/vendor/packed_simd_2/src/codegen/v32.rs b/vendor/packed_simd_2/src/codegen/v32.rs new file mode 100644 index 000000000..ae1dabd00 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/v32.rs @@ -0,0 +1,11 @@ +//! Internal 32-bit wide vector types + +use crate::masks::*; + +impl_simd_array!([i8; 4]: i8x4 | i8, i8, i8, i8); +impl_simd_array!([u8; 4]: u8x4 | u8, u8, u8, u8); +impl_simd_array!([m8; 4]: m8x4 | i8, i8, i8, i8); + +impl_simd_array!([i16; 2]: i16x2 | i16, i16); +impl_simd_array!([u16; 2]: u16x2 | u16, u16); +impl_simd_array!([m16; 2]: m16x2 | i16, i16); diff --git a/vendor/packed_simd_2/src/codegen/v512.rs b/vendor/packed_simd_2/src/codegen/v512.rs new file mode 100644 index 000000000..bf9511034 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/v512.rs @@ -0,0 +1,145 @@ +//! 
Internal 512-bit wide vector types + +use crate::masks::*; + +#[rustfmt::skip] +impl_simd_array!( + [i8; 64]: i8x64 | + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8 +); +#[rustfmt::skip] +impl_simd_array!( + [u8; 64]: u8x64 | + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8 +); +#[rustfmt::skip] +impl_simd_array!( + [m8; 64]: m8x64 | + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8 +); +#[rustfmt::skip] +impl_simd_array!( + [i16; 32]: i16x32 | + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16 +); +#[rustfmt::skip] +impl_simd_array!( + [u16; 32]: u16x32 | + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16 +); +#[rustfmt::skip] +impl_simd_array!( + [m16; 32]: m16x32 | + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16 +); + +#[rustfmt::skip] +impl_simd_array!( + [i32; 16]: i32x16 | + i32, i32, i32, i32, + i32, i32, i32, i32, + i32, i32, i32, i32, + i32, i32, i32, i32 +); +#[rustfmt::skip] +impl_simd_array!( + [u32; 16]: u32x16 | + u32, u32, u32, u32, + u32, u32, u32, u32, + u32, u32, u32, u32, + u32, u32, u32, u32 +); +#[rustfmt::skip] +impl_simd_array!( + [f32; 16]: f32x16 | + f32, f32, f32, f32, + f32, f32, f32, f32, + f32, f32, f32, f32, + f32, f32, f32, f32 +); +#[rustfmt::skip] +impl_simd_array!( + [m32; 16]: m32x16 | + i32, i32, i32, i32, + i32, i32, i32, i32, + i32, i32, i32, i32, + i32, i32, i32, i32 +); + +impl_simd_array!([i64; 8]: i64x8 | i64, i64, i64, i64, i64, i64, i64, i64); +impl_simd_array!([u64; 8]: u64x8 | u64, u64, u64, u64, u64, u64, u64, u64); +impl_simd_array!([f64; 8]: f64x8 | f64, f64, f64, f64, f64, f64, f64, f64); +impl_simd_array!([m64; 8]: m64x8 | i64, i64, i64, i64, i64, i64, i64, i64); + +impl_simd_array!([i128; 4]: i128x4 | i128, i128, i128, i128); +impl_simd_array!([u128; 4]: u128x4 | u128, u128, u128, u128); +impl_simd_array!([m128; 4]: m128x4 | i128, i128, i128, i128); diff --git a/vendor/packed_simd_2/src/codegen/v64.rs b/vendor/packed_simd_2/src/codegen/v64.rs new file mode 100644 index 000000000..3cfb67c1a --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/v64.rs @@ -0,0 +1,21 @@ +//! 
Internal 64-bit wide vector types + +use crate::masks::*; + +impl_simd_array!([i8; 8]: i8x8 | i8, i8, i8, i8, i8, i8, i8, i8); +impl_simd_array!([u8; 8]: u8x8 | u8, u8, u8, u8, u8, u8, u8, u8); +impl_simd_array!([m8; 8]: m8x8 | i8, i8, i8, i8, i8, i8, i8, i8); + +impl_simd_array!([i16; 4]: i16x4 | i16, i16, i16, i16); +impl_simd_array!([u16; 4]: u16x4 | u16, u16, u16, u16); +impl_simd_array!([m16; 4]: m16x4 | i16, i16, i16, i16); + +impl_simd_array!([i32; 2]: i32x2 | i32, i32); +impl_simd_array!([u32; 2]: u32x2 | u32, u32); +impl_simd_array!([f32; 2]: f32x2 | f32, f32); +impl_simd_array!([m32; 2]: m32x2 | i32, i32); + +impl_simd_array!([i64; 1]: i64x1 | i64); +impl_simd_array!([u64; 1]: u64x1 | u64); +impl_simd_array!([f64; 1]: f64x1 | f64); +impl_simd_array!([m64; 1]: m64x1 | i64); diff --git a/vendor/packed_simd_2/src/codegen/vPtr.rs b/vendor/packed_simd_2/src/codegen/vPtr.rs new file mode 100644 index 000000000..cf4765538 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/vPtr.rs @@ -0,0 +1,35 @@ +//! Pointer vector types + +macro_rules! impl_simd_ptr { + ([$ptr_ty:ty; $elem_count:expr]: $tuple_id:ident | $ty:ident + | $($tys:ty),*) => { + #[derive(Copy, Clone)] + #[repr(simd)] + pub struct $tuple_id<$ty>($(crate $tys),*); + //^^^^^^^ leaked through SimdArray + + impl<$ty> crate::sealed::Seal for [$ptr_ty; $elem_count] {} + impl<$ty> crate::sealed::SimdArray for [$ptr_ty; $elem_count] { + type Tuple = $tuple_id<$ptr_ty>; + type T = $ptr_ty; + const N: usize = $elem_count; + type NT = [u32; $elem_count]; + } + + impl<$ty> crate::sealed::Seal for $tuple_id<$ptr_ty> {} + impl<$ty> crate::sealed::Simd for $tuple_id<$ptr_ty> { + type Element = $ptr_ty; + const LANES: usize = $elem_count; + type LanesType = [u32; $elem_count]; + } + + } +} + +impl_simd_ptr!([*const T; 2]: cptrx2 | T | T, T); +impl_simd_ptr!([*const T; 4]: cptrx4 | T | T, T, T, T); +impl_simd_ptr!([*const T; 8]: cptrx8 | T | T, T, T, T, T, T, T, T); + +impl_simd_ptr!([*mut T; 2]: mptrx2 | T | T, T); +impl_simd_ptr!([*mut T; 4]: mptrx4 | T | T, T, T, T); +impl_simd_ptr!([*mut T; 8]: mptrx8 | T | T, T, T, T, T, T, T, T); diff --git a/vendor/packed_simd_2/src/codegen/vSize.rs b/vendor/packed_simd_2/src/codegen/vSize.rs new file mode 100644 index 000000000..3911b2134 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/vSize.rs @@ -0,0 +1,43 @@ +//! Vector types with pointer-sized elements + +use crate::codegen::pointer_sized_int::{isize_, usize_}; +use crate::masks::*; + +impl_simd_array!([isize; 2]: isizex2 | isize_, isize_); +impl_simd_array!([usize; 2]: usizex2 | usize_, usize_); +impl_simd_array!([msize; 2]: msizex2 | isize_, isize_); + +impl_simd_array!([isize; 4]: isizex4 | isize_, isize_, isize_, isize_); +impl_simd_array!([usize; 4]: usizex4 | usize_, usize_, usize_, usize_); +impl_simd_array!([msize; 4]: msizex4 | isize_, isize_, isize_, isize_); + +impl_simd_array!( + [isize; 8]: isizex8 | isize_, + isize_, + isize_, + isize_, + isize_, + isize_, + isize_, + isize_ +); +impl_simd_array!( + [usize; 8]: usizex8 | usize_, + usize_, + usize_, + usize_, + usize_, + usize_, + usize_, + usize_ +); +impl_simd_array!( + [msize; 8]: msizex8 | isize_, + isize_, + isize_, + isize_, + isize_, + isize_, + isize_, + isize_ +); -- cgit v1.2.3
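A short usage sketch of the dynamic shuffle machinery added by this patch, driven through the public API (assuming the vendored crate is imported as `packed_simd` and exposes `shuffle1_dyn` on `u8x16` with `u8x16` indices, as the `Shuffle1Dyn` impls in shuffle1_dyn.rs suggest; illustration only, not part of the patch):

    use packed_simd::u8x16;

    fn main() {
        // Reverse the lanes of a byte vector with run-time indices. On x86 with
        // SSSE3 this dispatches to the `_mm_shuffle_epi8` path above; elsewhere
        // it falls back to the per-lane extract/replace loop.
        let v = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let idx = u8x16::new(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let r = v.shuffle1_dyn(idx);
        assert_eq!(r.extract(0), 15);
    }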