From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Wed, 17 Apr 2024 14:02:58 +0200
Subject: Adding upstream version 1.64.0+dfsg1.

Signed-off-by: Daniel Baumann
---
 .../packed_simd_2/src/codegen/reductions/mask.rs   |  69 ++++++
 .../src/codegen/reductions/mask/aarch64.rs         |  71 ++++++
 .../src/codegen/reductions/mask/arm.rs             |  54 +++++
 .../src/codegen/reductions/mask/fallback.rs        |   6 +
 .../src/codegen/reductions/mask/fallback_impl.rs   | 237 +++++++++++++++++++++
 .../src/codegen/reductions/mask/x86.rs             | 188 ++++++++++++++++
 .../src/codegen/reductions/mask/x86/avx.rs         | 101 +++++++++
 .../src/codegen/reductions/mask/x86/avx2.rs        |  35 +++
 .../src/codegen/reductions/mask/x86/sse.rs         |  36 ++++
 .../src/codegen/reductions/mask/x86/sse2.rs        |  70 ++++++
 10 files changed, 867 insertions(+)
 create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask.rs
 create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/aarch64.rs
 create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/arm.rs
 create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/fallback.rs
 create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs
 create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86.rs
 create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs
 create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs
 create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs
 create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs

diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask.rs b/vendor/packed_simd_2/src/codegen/reductions/mask.rs
new file mode 100644
index 000000000..97260c6d4
--- /dev/null
+++ b/vendor/packed_simd_2/src/codegen/reductions/mask.rs
@@ -0,0 +1,69 @@
+//! Code generation workaround for the `all()` and `any()` mask horizontal reductions.
+//!
+//! Works around [LLVM bug 36702].
+//!
+//! [LLVM bug 36702]: https://bugs.llvm.org/show_bug.cgi?id=36702
+#![allow(unused_macros)]
+
+use crate::*;
+
+crate trait All: crate::marker::Sized {
+    unsafe fn all(self) -> bool;
+}
+
+crate trait Any: crate::marker::Sized {
+    unsafe fn any(self) -> bool;
+}
+
+#[macro_use]
+mod fallback_impl;
+
+cfg_if!
{ + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + #[macro_use] + mod x86; + } else if #[cfg(all(target_arch = "arm", target_feature = "v7", + target_feature = "neon", + any(feature = "core_arch", libcore_neon)))] { + #[macro_use] + mod arm; + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + #[macro_use] + mod aarch64; + } else { + #[macro_use] + mod fallback; + } +} + +impl_mask_reductions!(m8x2); +impl_mask_reductions!(m8x4); +impl_mask_reductions!(m8x8); +impl_mask_reductions!(m8x16); +impl_mask_reductions!(m8x32); +impl_mask_reductions!(m8x64); + +impl_mask_reductions!(m16x2); +impl_mask_reductions!(m16x4); +impl_mask_reductions!(m16x8); +impl_mask_reductions!(m16x16); +impl_mask_reductions!(m16x32); + +impl_mask_reductions!(m32x2); +impl_mask_reductions!(m32x4); +impl_mask_reductions!(m32x8); +impl_mask_reductions!(m32x16); + +// FIXME: 64-bit single element vector +// impl_mask_reductions!(m64x1); +impl_mask_reductions!(m64x2); +impl_mask_reductions!(m64x4); +impl_mask_reductions!(m64x8); + +impl_mask_reductions!(m128x1); +impl_mask_reductions!(m128x2); +impl_mask_reductions!(m128x4); + +impl_mask_reductions!(msizex2); +impl_mask_reductions!(msizex4); +impl_mask_reductions!(msizex8); diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/aarch64.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/aarch64.rs new file mode 100644 index 000000000..e9586eace --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/aarch64.rs @@ -0,0 +1,71 @@ +//! Mask reductions implementation for `aarch64` targets + +/// 128-bit wide vectors +macro_rules! aarch64_128_neon_impl { + ($id:ident, $vmin:ident, $vmax:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn all(self) -> bool { + use crate::arch::aarch64::$vmin; + $vmin(crate::mem::transmute(self)) != 0 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn any(self) -> bool { + use crate::arch::aarch64::$vmax; + $vmax(crate::mem::transmute(self)) != 0 + } + } + } +} + +/// 64-bit wide vectors +macro_rules! aarch64_64_neon_impl { + ($id:ident, $vec128:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn all(self) -> bool { + // Duplicates the 64-bit vector into a 128-bit one and + // calls all on that. + union U { + halves: ($id, $id), + vec: $vec128, + } + U { + halves: (self, self), + }.vec.all() + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn any(self) -> bool { + union U { + halves: ($id, $id), + vec: $vec128, + } + U { + halves: (self, self), + }.vec.any() + } + } + }; +} + +/// Mask reduction implementation for `aarch64` targets +macro_rules! 
impl_mask_reductions { + // 64-bit wide masks + (m8x8) => { aarch64_64_neon_impl!(m8x8, m8x16); }; + (m16x4) => { aarch64_64_neon_impl!(m16x4, m16x8); }; + (m32x2) => { aarch64_64_neon_impl!(m32x2, m32x4); }; + // 128-bit wide masks + (m8x16) => { aarch64_128_neon_impl!(m8x16, vminvq_u8, vmaxvq_u8); }; + (m16x8) => { aarch64_128_neon_impl!(m16x8, vminvq_u16, vmaxvq_u16); }; + (m32x4) => { aarch64_128_neon_impl!(m32x4, vminvq_u32, vmaxvq_u32); }; + // Fallback to LLVM's default code-generation: + ($id:ident) => { fallback_impl!($id); }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/arm.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/arm.rs new file mode 100644 index 000000000..1987af7a9 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/arm.rs @@ -0,0 +1,54 @@ +//! Mask reductions implementation for `arm` targets + +/// Implementation for ARM + v7 + NEON for 64-bit or 128-bit wide vectors with +/// more than two elements. +macro_rules! arm_128_v7_neon_impl { + ($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn all(self) -> bool { + use crate::arch::arm::$vpmin; + use crate::mem::transmute; + union U { + halves: ($half, $half), + vec: $id, + } + let halves = U { vec: self }.halves; + let h: $half = transmute($vpmin( + transmute(halves.0), + transmute(halves.1), + )); + h.all() + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn any(self) -> bool { + use crate::arch::arm::$vpmax; + use crate::mem::transmute; + union U { + halves: ($half, $half), + vec: $id, + } + let halves = U { vec: self }.halves; + let h: $half = transmute($vpmax( + transmute(halves.0), + transmute(halves.1), + )); + h.any() + } + } + }; +} + +/// Mask reduction implementation for `arm` targets +macro_rules! impl_mask_reductions { + // 128-bit wide masks + (m8x16) => { arm_128_v7_neon_impl!(m8x16, m8x8, vpmin_u8, vpmax_u8); }; + (m16x8) => { arm_128_v7_neon_impl!(m16x8, m16x4, vpmin_u16, vpmax_u16); }; + (m32x4) => { arm_128_v7_neon_impl!(m32x4, m32x2, vpmin_u32, vpmax_u32); }; + // Fallback to LLVM's default code-generation: + ($id:ident) => { fallback_impl!($id); }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/fallback.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/fallback.rs new file mode 100644 index 000000000..25e5c813a --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/fallback.rs @@ -0,0 +1,6 @@ +//! Default mask reduction implementations. + +/// Default mask reduction implementation +macro_rules! impl_mask_reductions { + ($id:ident) => { fallback_impl!($id); }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs new file mode 100644 index 000000000..0d246e2fd --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs @@ -0,0 +1,237 @@ +//! Default implementation of a mask reduction for any target. + +macro_rules! fallback_to_other_impl { + ($id:ident, $other:ident) => { + impl All for $id { + #[inline] + unsafe fn all(self) -> bool { + let m: $other = crate::mem::transmute(self); + m.all() + } + } + impl Any for $id { + #[inline] + unsafe fn any(self) -> bool { + let m: $other = crate::mem::transmute(self); + m.any() + } + } + }; +} + +/// Fallback implementation. +macro_rules! 
fallback_impl { + // 16-bit wide masks: + (m8x2) => { + impl All for m8x2 { + #[inline] + unsafe fn all(self) -> bool { + let i: u16 = crate::mem::transmute(self); + i == u16::max_value() + } + } + impl Any for m8x2 { + #[inline] + unsafe fn any(self) -> bool { + let i: u16 = crate::mem::transmute(self); + i != 0 + } + } + }; + // 32-bit wide masks + (m8x4) => { + impl All for m8x4 { + #[inline] + unsafe fn all(self) -> bool { + let i: u32 = crate::mem::transmute(self); + i == u32::max_value() + } + } + impl Any for m8x4 { + #[inline] + unsafe fn any(self) -> bool { + let i: u32 = crate::mem::transmute(self); + i != 0 + } + } + }; + (m16x2) => { + fallback_to_other_impl!(m16x2, m8x4); + }; + // 64-bit wide masks: + (m8x8) => { + impl All for m8x8 { + #[inline] + unsafe fn all(self) -> bool { + let i: u64 = crate::mem::transmute(self); + i == u64::max_value() + } + } + impl Any for m8x8 { + #[inline] + unsafe fn any(self) -> bool { + let i: u64 = crate::mem::transmute(self); + i != 0 + } + } + }; + (m16x4) => { + fallback_to_other_impl!(m16x4, m8x8); + }; + (m32x2) => { + fallback_to_other_impl!(m32x2, m16x4); + }; + // FIXME: 64x1 maxk + // 128-bit wide masks: + (m8x16) => { + impl All for m8x16 { + #[inline] + unsafe fn all(self) -> bool { + let i: u128 = crate::mem::transmute(self); + i == u128::max_value() + } + } + impl Any for m8x16 { + #[inline] + unsafe fn any(self) -> bool { + let i: u128 = crate::mem::transmute(self); + i != 0 + } + } + }; + (m16x8) => { + fallback_to_other_impl!(m16x8, m8x16); + }; + (m32x4) => { + fallback_to_other_impl!(m32x4, m16x8); + }; + (m64x2) => { + fallback_to_other_impl!(m64x2, m32x4); + }; + (m128x1) => { + fallback_to_other_impl!(m128x1, m64x2); + }; + // 256-bit wide masks + (m8x32) => { + impl All for m8x32 { + #[inline] + unsafe fn all(self) -> bool { + let i: [u128; 2] = crate::mem::transmute(self); + let o: [u128; 2] = [u128::max_value(); 2]; + i == o + } + } + impl Any for m8x32 { + #[inline] + unsafe fn any(self) -> bool { + let i: [u128; 2] = crate::mem::transmute(self); + let o: [u128; 2] = [0; 2]; + i != o + } + } + }; + (m16x16) => { + fallback_to_other_impl!(m16x16, m8x32); + }; + (m32x8) => { + fallback_to_other_impl!(m32x8, m16x16); + }; + (m64x4) => { + fallback_to_other_impl!(m64x4, m32x8); + }; + (m128x2) => { + fallback_to_other_impl!(m128x2, m64x4); + }; + // 512-bit wide masks + (m8x64) => { + impl All for m8x64 { + #[inline] + unsafe fn all(self) -> bool { + let i: [u128; 4] = crate::mem::transmute(self); + let o: [u128; 4] = [u128::max_value(); 4]; + i == o + } + } + impl Any for m8x64 { + #[inline] + unsafe fn any(self) -> bool { + let i: [u128; 4] = crate::mem::transmute(self); + let o: [u128; 4] = [0; 4]; + i != o + } + } + }; + (m16x32) => { + fallback_to_other_impl!(m16x32, m8x64); + }; + (m32x16) => { + fallback_to_other_impl!(m32x16, m16x32); + }; + (m64x8) => { + fallback_to_other_impl!(m64x8, m32x16); + }; + (m128x4) => { + fallback_to_other_impl!(m128x4, m64x8); + }; + // Masks with pointer-sized elements64 + (msizex2) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex2, m64x2); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex2, m32x2); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + (msizex4) => { + cfg_if! 
{ + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex4, m64x4); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex4, m32x4); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + (msizex8) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex8, m64x8); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex8, m32x8); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; +} + +macro_rules! recurse_half { + ($vid:ident, $vid_h:ident) => { + impl All for $vid { + #[inline] + unsafe fn all(self) -> bool { + union U { + halves: ($vid_h, $vid_h), + vec: $vid, + } + let halves = U { vec: self }.halves; + halves.0.all() && halves.1.all() + } + } + impl Any for $vid { + #[inline] + unsafe fn any(self) -> bool { + union U { + halves: ($vid_h, $vid_h), + vec: $vid, + } + let halves = U { vec: self }.halves; + halves.0.any() || halves.1.any() + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86.rs new file mode 100644 index 000000000..bcfb1a6e1 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86.rs @@ -0,0 +1,188 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets + +#[cfg(target_feature = "sse")] +#[macro_use] +mod sse; + +#[cfg(target_feature = "sse2")] +#[macro_use] +mod sse2; + +#[cfg(target_feature = "avx")] +#[macro_use] +mod avx; + +#[cfg(target_feature = "avx2")] +#[macro_use] +mod avx2; + +/// x86 64-bit m8x8 implementation +macro_rules! x86_m8x8_impl { + ($id:ident) => { + fallback_impl!($id); + }; +} + +/// x86 128-bit m8x16 implementation +macro_rules! x86_m8x16_impl { + ($id:ident) => { + cfg_if! { + if #[cfg(target_feature = "sse2")] { + x86_m8x16_sse2_impl!($id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 128-bit m32x4 implementation +macro_rules! x86_m32x4_impl { + ($id:ident) => { + cfg_if! { + if #[cfg(target_feature = "sse")] { + x86_m32x4_sse_impl!($id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 128-bit m64x2 implementation +macro_rules! x86_m64x2_impl { + ($id:ident) => { + cfg_if! { + if #[cfg(target_feature = "sse2")] { + x86_m64x2_sse2_impl!($id); + } else if #[cfg(target_feature = "sse")] { + x86_m32x4_sse_impl!($id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 256-bit m8x32 implementation +macro_rules! x86_m8x32_impl { + ($id:ident, $half_id:ident) => { + cfg_if! { + if #[cfg(target_feature = "avx2")] { + x86_m8x32_avx2_impl!($id); + } else if #[cfg(target_feature = "avx")] { + x86_m8x32_avx_impl!($id); + } else if #[cfg(target_feature = "sse2")] { + recurse_half!($id, $half_id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 256-bit m32x8 implementation +macro_rules! x86_m32x8_impl { + ($id:ident, $half_id:ident) => { + cfg_if! { + if #[cfg(target_feature = "avx")] { + x86_m32x8_avx_impl!($id); + } else if #[cfg(target_feature = "sse")] { + recurse_half!($id, $half_id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 256-bit m64x4 implementation +macro_rules! x86_m64x4_impl { + ($id:ident, $half_id:ident) => { + cfg_if! { + if #[cfg(target_feature = "avx")] { + x86_m64x4_avx_impl!($id); + } else if #[cfg(target_feature = "sse")] { + recurse_half!($id, $half_id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// Fallback implementation. +macro_rules! 
x86_intr_impl { + ($id:ident) => { + impl All for $id { + #[inline] + unsafe fn all(self) -> bool { + use crate::llvm::simd_reduce_all; + simd_reduce_all(self.0) + } + } + impl Any for $id { + #[inline] + unsafe fn any(self) -> bool { + use crate::llvm::simd_reduce_any; + simd_reduce_any(self.0) + } + } + }; +} + +/// Mask reduction implementation for `x86` and `x86_64` targets +macro_rules! impl_mask_reductions { + // 64-bit wide masks + (m8x8) => { x86_m8x8_impl!(m8x8); }; + (m16x4) => { x86_m8x8_impl!(m16x4); }; + (m32x2) => { x86_m8x8_impl!(m32x2); }; + // 128-bit wide masks + (m8x16) => { x86_m8x16_impl!(m8x16); }; + (m16x8) => { x86_m8x16_impl!(m16x8); }; + (m32x4) => { x86_m32x4_impl!(m32x4); }; + (m64x2) => { x86_m64x2_impl!(m64x2); }; + (m128x1) => { x86_intr_impl!(m128x1); }; + // 256-bit wide masks: + (m8x32) => { x86_m8x32_impl!(m8x32, m8x16); }; + (m16x16) => { x86_m8x32_impl!(m16x16, m16x8); }; + (m32x8) => { x86_m32x8_impl!(m32x8, m32x4); }; + (m64x4) => { x86_m64x4_impl!(m64x4, m64x2); }; + (m128x2) => { x86_intr_impl!(m128x2); }; + (msizex2) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex2, m64x2); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex2, m32x2); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + (msizex4) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex4, m64x4); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex4, m32x4); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + (msizex8) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex8, m64x8); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex8, m32x8); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + + // Fallback to LLVM's default code-generation: + ($id:ident) => { fallback_impl!($id); }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs new file mode 100644 index 000000000..d18736fb0 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs @@ -0,0 +1,101 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets with `AVX` + +/// `x86`/`x86_64` 256-bit `AVX` implementation +/// FIXME: it might be faster here to do two `_mm_movmask_epi8` +#[cfg(target_feature = "avx")] +macro_rules! x86_m8x32_avx_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "avx")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_testc_si256; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_testc_si256; + _mm256_testc_si256( + crate::mem::transmute(self), + crate::mem::transmute($id::splat(true)), + ) != 0 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "avx")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_testz_si256; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_testz_si256; + _mm256_testz_si256( + crate::mem::transmute(self), + crate::mem::transmute(self), + ) == 0 + } + } + }; +} + +/// `x86`/`x86_64` 256-bit m32x8 `AVX` implementation +macro_rules! 
x86_m32x8_avx_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_ps; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_ps; + // _mm256_movemask_ps(a) creates a 8bit mask containing the + // most significant bit of each lane of `a`. If all bits are + // set, then all 8 lanes of the mask are true. + _mm256_movemask_ps(crate::mem::transmute(self)) == 0b_1111_1111_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_ps; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_ps; + + _mm256_movemask_ps(crate::mem::transmute(self)) != 0 + } + } + }; +} + +/// `x86`/`x86_64` 256-bit m64x4 `AVX` implementation +macro_rules! x86_m64x4_avx_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_pd; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_pd; + // _mm256_movemask_pd(a) creates a 4bit mask containing the + // most significant bit of each lane of `a`. If all bits are + // set, then all 4 lanes of the mask are true. + _mm256_movemask_pd(crate::mem::transmute(self)) == 0b_1111_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_pd; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_pd; + + _mm256_movemask_pd(crate::mem::transmute(self)) != 0 + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs new file mode 100644 index 000000000..d37d02342 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs @@ -0,0 +1,35 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets with `AVX2`. +#![allow(unused)] + +/// x86/x86_64 256-bit m8x32 AVX2 implementation +macro_rules! x86_m8x32_avx2_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_epi8; + // _mm256_movemask_epi8(a) creates a 32bit mask containing the + // most significant bit of each byte of `a`. If all + // bits are set, then all 32 lanes of the mask are + // true. + _mm256_movemask_epi8(crate::mem::transmute(self)) == -1_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_epi8; + + _mm256_movemask_epi8(crate::mem::transmute(self)) != 0 + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs new file mode 100644 index 000000000..eb1ef7fac --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs @@ -0,0 +1,36 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets with `SSE`. 
+#![allow(unused)] + +/// `x86`/`x86_64` 128-bit `m32x4` `SSE` implementation +macro_rules! x86_m32x4_sse_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_ps; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_ps; + // _mm_movemask_ps(a) creates a 4bit mask containing the + // most significant bit of each lane of `a`. If all + // bits are set, then all 4 lanes of the mask are + // true. + _mm_movemask_ps(crate::mem::transmute(self)) + == 0b_1111_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_ps; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_ps; + + _mm_movemask_ps(crate::mem::transmute(self)) != 0 + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs new file mode 100644 index 000000000..a99c606f5 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs @@ -0,0 +1,70 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets with `SSE2`. +#![allow(unused)] + +/// `x86`/`x86_64` 128-bit m64x2 `SSE2` implementation +macro_rules! x86_m64x2_sse2_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_pd; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_pd; + // _mm_movemask_pd(a) creates a 2bit mask containing the + // most significant bit of each lane of `a`. If all + // bits are set, then all 2 lanes of the mask are + // true. + _mm_movemask_pd(crate::mem::transmute(self)) + == 0b_11_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_pd; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_pd; + + _mm_movemask_pd(crate::mem::transmute(self)) != 0 + } + } + }; +} + +/// `x86`/`x86_64` 128-bit m8x16 `SSE2` implementation +macro_rules! x86_m8x16_sse2_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_epi8; + // _mm_movemask_epi8(a) creates a 16bit mask containing the + // most significant bit of each byte of `a`. If all + // bits are set, then all 16 lanes of the mask are + // true. + _mm_movemask_epi8(crate::mem::transmute(self)) + == i32::from(u16::max_value()) + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_epi8; + + _mm_movemask_epi8(crate::mem::transmute(self)) != 0 + } + } + }; +} -- cgit v1.2.3
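
Editor's note, appended after the patch: the standalone sketch below illustrates the reduction strategy that the x86 macros in this patch generate. A vector mask whose lanes are all-ones or all-zeros is packed into an integer bitmask with a movemask instruction, after which `all()` is a comparison against the all-set value and `any()` a comparison against zero. This is an illustrative example only, not code from the patch or from `packed_simd_2`'s API; the function names `mask_all`/`mask_any` and the `[u8; 16]` mask representation are assumptions made for the sketch, which mirrors `x86_m8x16_sse2_impl` on an `x86_64` target where SSE2 is the baseline.

// Illustrative sketch only (not part of the upstream patch): a scalar-callable
// model of the SSE2 strategy generated by `x86_m8x16_sse2_impl`.
// Assumed lane convention: each of the 16 mask lanes is 0xFF (true) or 0x00 (false).
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
mod movemask_sketch {
    use core::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8};

    // `all()`: movemask packs the most significant bit of every byte into a
    // 16-bit integer, so "all lanes true" means the packed mask is 0xFFFF.
    pub fn mask_all(lanes: &[u8; 16]) -> bool {
        // SAFETY: SSE2 is statically enabled by the cfg above; the unaligned
        // load reads exactly 16 bytes from a valid, properly sized array.
        unsafe {
            let v: __m128i = _mm_loadu_si128(lanes.as_ptr() as *const __m128i);
            _mm_movemask_epi8(v) == 0xFFFF
        }
    }

    // `any()`: at least one most significant bit is set, i.e. the packed mask is non-zero.
    pub fn mask_any(lanes: &[u8; 16]) -> bool {
        // SAFETY: as above.
        unsafe {
            let v: __m128i = _mm_loadu_si128(lanes.as_ptr() as *const __m128i);
            _mm_movemask_epi8(v) != 0
        }
    }
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
fn main() {
    let all_true = [0xFF_u8; 16];
    let mut one_false = all_true;
    one_false[7] = 0x00;
    assert!(movemask_sketch::mask_all(&all_true));
    assert!(!movemask_sketch::mask_all(&one_false));
    assert!(movemask_sketch::mask_any(&one_false));
    assert!(!movemask_sketch::mask_any(&[0_u8; 16]));
    println!("movemask-based all()/any() behave as expected");
}

#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
fn main() {}

The AVX, AVX2, and NEON paths in the patch follow the same shape with different packing instructions (`_mm256_movemask_*`, `_mm256_testc_si256`/`_mm256_testz_si256`, and `vminvq_*`/`vmaxvq_*` respectively), while the generic fallback transmutes the mask to an integer (or an array of `u128`) and compares it against the all-ones and zero patterns.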