From ef24de24a82fe681581cc130f342363c47c0969a Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 7 Jun 2024 07:48:48 +0200 Subject: Merging upstream version 1.75.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/packed_simd/src/codegen/shuffle1_dyn.rs | 408 +++++++++++++++++++++++++ 1 file changed, 408 insertions(+) create mode 100644 vendor/packed_simd/src/codegen/shuffle1_dyn.rs (limited to 'vendor/packed_simd/src/codegen/shuffle1_dyn.rs') diff --git a/vendor/packed_simd/src/codegen/shuffle1_dyn.rs b/vendor/packed_simd/src/codegen/shuffle1_dyn.rs new file mode 100644 index 000000000..19d457a45 --- /dev/null +++ b/vendor/packed_simd/src/codegen/shuffle1_dyn.rs @@ -0,0 +1,408 @@ +//! Shuffle vector lanes with run-time indices. + +use crate::*; + +pub trait Shuffle1Dyn { + type Indices; + fn shuffle1_dyn(self, _: Self::Indices) -> Self; +} + +// Fallback implementation +macro_rules! impl_fallback { + ($id:ident) => { + impl Shuffle1Dyn for $id { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + let mut result = Self::splat(0); + for i in 0..$id::lanes() { + result = result.replace(i, self.extract(indices.extract(i) as usize)); + } + result + } + } + }; +} + +macro_rules! impl_shuffle1_dyn { + (u8x8) => { + cfg_if! { + if #[cfg(all( + any( + all(target_arch = "aarch64", target_feature = "neon"), + all(target_arch = "doesnotexist", target_feature = "v7", + target_feature = "neon") + ), + any(feature = "core_arch", libcore_neon) + ) + )] { + impl Shuffle1Dyn for u8x8 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + #[cfg(target_arch = "aarch64")] + use crate::arch::aarch64::vtbl1_u8; + #[cfg(target_arch = "doesnotexist")] + use crate::arch::arm::vtbl1_u8; + + // This is safe because the binary is compiled with + // neon enabled at compile-time and can therefore only + // run on CPUs that have it enabled. + unsafe { + Simd(mem::transmute( + vtbl1_u8(mem::transmute(self.0), + crate::mem::transmute(indices.0)) + )) + } + } + } + } else { + impl_fallback!(u8x8); + } + } + }; + (u8x16) => { + cfg_if! { + if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "ssse3"))] { + impl Shuffle1Dyn for u8x16 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_shuffle_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_shuffle_epi8; + // This is safe because the binary is compiled with + // ssse3 enabled at compile-time and can therefore only + // run on CPUs that have it enabled. + unsafe { + Simd(mem::transmute( + _mm_shuffle_epi8(mem::transmute(self.0), + crate::mem::transmute(indices)) + )) + } + } + } + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon", + any(feature = "core_arch", libcore_neon)))] { + impl Shuffle1Dyn for u8x16 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + use crate::arch::aarch64::vqtbl1q_u8; + + // This is safe because the binary is compiled with + // neon enabled at compile-time and can therefore only + // run on CPUs that have it enabled. + unsafe { + Simd(mem::transmute( + vqtbl1q_u8(mem::transmute(self.0), + crate::mem::transmute(indices.0)) + )) + } + } + } + } else if #[cfg(all(target_arch = "doesnotexist", target_feature = "v7", + target_feature = "neon", + any(feature = "core_arch", libcore_neon)))] { + impl Shuffle1Dyn for u8x16 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + use crate::arch::arm::vtbl2_u8; + + // This is safe because the binary is compiled with + // neon enabled at compile-time and can therefore only + // run on CPUs that have it enabled. + unsafe { + union U { + j: u8x16, + s: (u8x8, u8x8), + } + + let (i0, i1) = U { j: y }.s; + + let r0 = vtbl2_u8( + mem::transmute(x), + crate::mem::transmute(i0) + ); + let r1 = vtbl2_u8( + mem::transmute(x), + crate::mem::transmute(i1) + ); + + let r = U { s: (r0, r1) }.j; + + Simd(mem::transmute(r)) + } + } + } + } else { + impl_fallback!(u8x16); + } + } + }; + (u16x8) => { + impl Shuffle1Dyn for u16x8 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + let indices: u8x8 = (indices * 2).cast(); + let indices: u8x16 = shuffle!(indices, [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7]); + let v = u8x16::new(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); + let indices = indices + v; + unsafe { + let s: u8x16 = crate::mem::transmute(self); + crate::mem::transmute(s.shuffle1_dyn(indices)) + } + } + } + }; + (u32x4) => { + cfg_if! { + if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx"))] { + impl Shuffle1Dyn for u32x4 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + #[cfg(target_arch = "x86")] + use crate::arch::x86::{_mm_permutevar_ps}; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::{_mm_permutevar_ps}; + + unsafe { + crate::mem::transmute( + _mm_permutevar_ps( + crate::mem::transmute(self.0), + crate::mem::transmute(indices.0) + ) + ) + } + } + } + } else { + impl Shuffle1Dyn for u32x4 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + let indices: u8x4 = (indices * 4).cast(); + let indices: u8x16 = shuffle!( + indices, + [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3] + ); + let v = u8x16::new( + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 + ); + let indices = indices + v; + unsafe { + let s: u8x16 =crate::mem::transmute(self); + crate::mem::transmute(s.shuffle1_dyn(indices)) + } + } + } + } + } + }; + (u64x2) => { + cfg_if! { + if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx"))] { + impl Shuffle1Dyn for u64x2 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + #[cfg(target_arch = "x86")] + use crate::arch::x86::{_mm_permutevar_pd}; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::{_mm_permutevar_pd}; + // _mm_permutevar_pd uses the _second_ bit of each + // element to perform the selection, that is: 0b00 => 0, + // 0b10 => 1: + let indices = indices << 1; + unsafe { + crate::mem::transmute( + _mm_permutevar_pd( + crate::mem::transmute(self), + crate::mem::transmute(indices) + ) + ) + } + } + } + } else { + impl Shuffle1Dyn for u64x2 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + let indices: u8x2 = (indices * 8).cast(); + let indices: u8x16 = shuffle!( + indices, + [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + ); + let v = u8x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 + ); + let indices = indices + v; + unsafe { + let s: u8x16 =crate::mem::transmute(self); + crate::mem::transmute(s.shuffle1_dyn(indices)) + } + } + } + } + } + }; + (u128x1) => { + impl Shuffle1Dyn for u128x1 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, _indices: Self::Indices) -> Self { + self + } + } + }; + ($id:ident) => { + impl_fallback!($id); + }; +} + +impl_shuffle1_dyn!(u8x2); +impl_shuffle1_dyn!(u8x4); +impl_shuffle1_dyn!(u8x8); +impl_shuffle1_dyn!(u8x16); +impl_shuffle1_dyn!(u8x32); +impl_shuffle1_dyn!(u8x64); + +impl_shuffle1_dyn!(u16x2); +impl_shuffle1_dyn!(u16x4); +impl_shuffle1_dyn!(u16x8); +impl_shuffle1_dyn!(u16x16); +impl_shuffle1_dyn!(u16x32); + +impl_shuffle1_dyn!(u32x2); +impl_shuffle1_dyn!(u32x4); +impl_shuffle1_dyn!(u32x8); +impl_shuffle1_dyn!(u32x16); + +impl_shuffle1_dyn!(u64x2); +impl_shuffle1_dyn!(u64x4); +impl_shuffle1_dyn!(u64x8); + +impl_shuffle1_dyn!(usizex2); +impl_shuffle1_dyn!(usizex4); +impl_shuffle1_dyn!(usizex8); + +impl_shuffle1_dyn!(u128x1); +impl_shuffle1_dyn!(u128x2); +impl_shuffle1_dyn!(u128x4); + +// Implementation for non-unsigned vector types +macro_rules! impl_shuffle1_dyn_non_u { + ($id:ident, $uid:ident) => { + impl Shuffle1Dyn for $id { + type Indices = $uid; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + unsafe { + let u: $uid = crate::mem::transmute(self); + crate::mem::transmute(u.shuffle1_dyn(indices)) + } + } + } + }; +} + +impl_shuffle1_dyn_non_u!(i8x2, u8x2); +impl_shuffle1_dyn_non_u!(i8x4, u8x4); +impl_shuffle1_dyn_non_u!(i8x8, u8x8); +impl_shuffle1_dyn_non_u!(i8x16, u8x16); +impl_shuffle1_dyn_non_u!(i8x32, u8x32); +impl_shuffle1_dyn_non_u!(i8x64, u8x64); + +impl_shuffle1_dyn_non_u!(i16x2, u16x2); +impl_shuffle1_dyn_non_u!(i16x4, u16x4); +impl_shuffle1_dyn_non_u!(i16x8, u16x8); +impl_shuffle1_dyn_non_u!(i16x16, u16x16); +impl_shuffle1_dyn_non_u!(i16x32, u16x32); + +impl_shuffle1_dyn_non_u!(i32x2, u32x2); +impl_shuffle1_dyn_non_u!(i32x4, u32x4); +impl_shuffle1_dyn_non_u!(i32x8, u32x8); +impl_shuffle1_dyn_non_u!(i32x16, u32x16); + +impl_shuffle1_dyn_non_u!(i64x2, u64x2); +impl_shuffle1_dyn_non_u!(i64x4, u64x4); +impl_shuffle1_dyn_non_u!(i64x8, u64x8); + +impl_shuffle1_dyn_non_u!(isizex2, usizex2); +impl_shuffle1_dyn_non_u!(isizex4, usizex4); +impl_shuffle1_dyn_non_u!(isizex8, usizex8); + +impl_shuffle1_dyn_non_u!(i128x1, u128x1); +impl_shuffle1_dyn_non_u!(i128x2, u128x2); +impl_shuffle1_dyn_non_u!(i128x4, u128x4); + +impl_shuffle1_dyn_non_u!(m8x2, u8x2); +impl_shuffle1_dyn_non_u!(m8x4, u8x4); +impl_shuffle1_dyn_non_u!(m8x8, u8x8); +impl_shuffle1_dyn_non_u!(m8x16, u8x16); +impl_shuffle1_dyn_non_u!(m8x32, u8x32); +impl_shuffle1_dyn_non_u!(m8x64, u8x64); + +impl_shuffle1_dyn_non_u!(m16x2, u16x2); +impl_shuffle1_dyn_non_u!(m16x4, u16x4); +impl_shuffle1_dyn_non_u!(m16x8, u16x8); +impl_shuffle1_dyn_non_u!(m16x16, u16x16); +impl_shuffle1_dyn_non_u!(m16x32, u16x32); + +impl_shuffle1_dyn_non_u!(m32x2, u32x2); +impl_shuffle1_dyn_non_u!(m32x4, u32x4); +impl_shuffle1_dyn_non_u!(m32x8, u32x8); +impl_shuffle1_dyn_non_u!(m32x16, u32x16); + +impl_shuffle1_dyn_non_u!(m64x2, u64x2); +impl_shuffle1_dyn_non_u!(m64x4, u64x4); +impl_shuffle1_dyn_non_u!(m64x8, u64x8); + +impl_shuffle1_dyn_non_u!(msizex2, usizex2); +impl_shuffle1_dyn_non_u!(msizex4, usizex4); +impl_shuffle1_dyn_non_u!(msizex8, usizex8); + +impl_shuffle1_dyn_non_u!(m128x1, u128x1); +impl_shuffle1_dyn_non_u!(m128x2, u128x2); +impl_shuffle1_dyn_non_u!(m128x4, u128x4); + +impl_shuffle1_dyn_non_u!(f32x2, u32x2); +impl_shuffle1_dyn_non_u!(f32x4, u32x4); +impl_shuffle1_dyn_non_u!(f32x8, u32x8); +impl_shuffle1_dyn_non_u!(f32x16, u32x16); + +impl_shuffle1_dyn_non_u!(f64x2, u64x2); +impl_shuffle1_dyn_non_u!(f64x4, u64x4); +impl_shuffle1_dyn_non_u!(f64x8, u64x8); + +// Implementation for non-unsigned vector types +macro_rules! impl_shuffle1_dyn_ptr { + ($id:ident, $uid:ident) => { + impl Shuffle1Dyn for $id { + type Indices = $uid; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + unsafe { + let u: $uid = crate::mem::transmute(self); + crate::mem::transmute(u.shuffle1_dyn(indices)) + } + } + } + }; +} + +impl_shuffle1_dyn_ptr!(cptrx2, usizex2); +impl_shuffle1_dyn_ptr!(cptrx4, usizex4); +impl_shuffle1_dyn_ptr!(cptrx8, usizex8); + +impl_shuffle1_dyn_ptr!(mptrx2, usizex2); +impl_shuffle1_dyn_ptr!(mptrx4, usizex4); +impl_shuffle1_dyn_ptr!(mptrx8, usizex8); -- cgit v1.2.3