Diffstat (limited to 'third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp')
-rw-r--r--    third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp    1739
1 file changed, 1739 insertions(+), 0 deletions(-)
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp
new file mode 100644
index 0000000000..5ec1e02d48
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp
@@ -0,0 +1,1739 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX_HPP
+#define XSIMD_AVX_HPP
+
+#include <complex>
+#include <limits>
+#include <type_traits>
+
+#include "../types/xsimd_avx_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
+ // fwd
+ template <class A, class T, size_t I>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
+
+ namespace detail
+ {
+ inline void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
+ {
+ low = _mm256_castsi256_si128(val);
+ high = _mm256_extractf128_si256(val, 1);
+ }
+ inline void split_avx(__m256 val, __m128& low, __m128& high) noexcept
+ {
+ low = _mm256_castps256_ps128(val);
+ high = _mm256_extractf128_ps(val, 1);
+ }
+ inline void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
+ {
+ low = _mm256_castpd256_pd128(val);
+ high = _mm256_extractf128_pd(val, 1);
+ }
+ inline __m256i merge_sse(__m128i low, __m128i high) noexcept
+ {
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1);
+ }
+ inline __m256 merge_sse(__m128 low, __m128 high) noexcept
+ {
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1);
+ }
+ inline __m256d merge_sse(__m128d low, __m128d high) noexcept
+ {
+ return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1);
+ }
+ template <class F>
+ inline __m256i fwd_to_sse(F f, __m256i self) noexcept
+ {
+ __m128i self_low, self_high;
+ split_avx(self, self_low, self_high);
+ __m128i res_low = f(self_low);
+ __m128i res_high = f(self_high);
+ return merge_sse(res_low, res_high);
+ }
+ template <class F>
+ inline __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept
+ {
+ __m128i self_low, self_high, other_low, other_high;
+ split_avx(self, self_low, self_high);
+ split_avx(other, other_low, other_high);
+ __m128i res_low = f(self_low, other_low);
+ __m128i res_high = f(self_high, other_high);
+ return merge_sse(res_low, res_high);
+ }
+ template <class F>
+ inline __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept
+ {
+ __m128i self_low, self_high;
+ split_avx(self, self_low, self_high);
+ __m128i res_low = f(self_low, other);
+ __m128i res_high = f(self_high, other);
+ return merge_sse(res_low, res_high);
+ }
+ }
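+
+ // Illustrative note (not part of the upstream sources): AVX1 lacks most
+ // 256-bit integer instructions, so the helpers above split a 256-bit
+ // integer register into two 128-bit halves, run an SSE kernel on each
+ // half, and merge the results back. A hypothetical element-wise
+ // increment could be forwarded like this:
+ //
+ //     __m256i plus_one = detail::fwd_to_sse(
+ //         [](__m128i half) noexcept
+ //         { return _mm_add_epi32(half, _mm_set1_epi32(1)); },
+ //         input); // `input` is an assumed __m256i of int32 lanes
+ //
+ // Most integer kernels below reuse this pattern.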
+
+ // abs
+ template <class A>
+ inline batch<float, A> abs(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31
+ return _mm256_andnot_ps(sign_mask, self);
+ }
+ template <class A>
+ inline batch<double, A> abs(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
+ return _mm256_andnot_pd(sign_mask, self);
+ }
+
+ // add
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return add(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A>
+ inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_add_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_add_pd(self, other);
+ }
+
+ // all
+ template <class A>
+ inline bool all(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_testc_ps(self, batch_bool<float, A>(true)) != 0;
+ }
+ template <class A>
+ inline bool all(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_testc_pd(self, batch_bool<double, A>(true)) != 0;
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline bool all(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_testc_si256(self, batch_bool<T, A>(true)) != 0;
+ }
+
+ // any
+ template <class A>
+ inline bool any(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return !_mm256_testz_ps(self, self);
+ }
+ template <class A>
+ inline bool any(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return !_mm256_testz_pd(self, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline bool any(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return !_mm256_testz_si256(self, self);
+ }
+
+ // batch_bool_cast
+ template <class A, class T_out, class T_in>
+ inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx>) noexcept
+ {
+ return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
+ }
+
+ // bitwise_and
+ template <class A>
+ inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_and_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_and_pd(self, other);
+ }
+
+ template <class A>
+ inline batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_and_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_and_pd(self, other);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+
+ // bitwise_andnot
+ template <class A>
+ inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_andnot_ps(other, self);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_andnot_pd(other, self);
+ }
+
+ template <class A>
+ inline batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_andnot_ps(other, self);
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_andnot_pd(other, self);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+
+ // bitwise_lshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
+ { return bitwise_lshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
+ self, other);
+ }
+
+ // bitwise_not
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s) noexcept
+ { return bitwise_not(batch<T, sse4_2>(s), sse4_2 {}); },
+ self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s) noexcept
+ { return bitwise_not(batch_bool<T, sse4_2>(s), sse4_2 {}); },
+ self);
+ }
+
+ // bitwise_or
+ template <class A>
+ inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_or_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_or_pd(self, other);
+ }
+ template <class A>
+ inline batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_or_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_or_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_or(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_or(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o)); },
+ self, other);
+ }
+
+ // bitwise_rshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
+ { return bitwise_rshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
+ self, other);
+ }
+
+ // bitwise_xor
+ template <class A>
+ inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, other);
+ }
+ template <class A>
+ inline batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_xor(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_xor(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o), sse4_2 {}); },
+ self, other);
+ }
+
+ // bitwise_cast
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castsi256_ps(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castsi256_pd(self);
+ }
+ template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
+ inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx>) noexcept
+ {
+ return batch<Tp, A>(self.data);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castps_pd(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castps_si256(self);
+ }
+ template <class A>
+ inline batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castpd_ps(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castpd_si256(self);
+ }
+
+ // bitwise_not
+ template <class A>
+ inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
+ }
+ template <class A>
+ inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
+ }
+ template <class A>
+ inline batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
+ }
+
+ // broadcast
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> broadcast(T val, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_set1_epi8(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_set1_epi16(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_set1_epi32(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_set1_epi64x(val);
+ }
+ else
+ {
+ assert(false && "unsupported");
+ return {};
+ }
+ }
+ template <class A>
+ inline batch<float, A> broadcast(float val, requires_arch<avx>) noexcept
+ {
+ return _mm256_set1_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> broadcast(double val, requires_arch<avx>) noexcept
+ {
+ return _mm256_set1_pd(val);
+ }
+
+ // ceil
+ template <class A>
+ inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_ceil_ps(self);
+ }
+ template <class A>
+ inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_ceil_pd(self);
+ }
+
+ namespace detail
+ {
+ // On clang, _mm256_extractf128_ps is built upon __builtin_shufflevector,
+ // which requires the index parameter to be a constant
+ template <int index, class B>
+ inline B get_half_complex_f(const B& real, const B& imag) noexcept
+ {
+ __m128 tmp0 = _mm256_extractf128_ps(real, index);
+ __m128 tmp1 = _mm256_extractf128_ps(imag, index);
+ __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1);
+ tmp0 = _mm_unpacklo_ps(tmp0, tmp1);
+ __m256 res = real;
+ res = _mm256_insertf128_ps(res, tmp0, 0);
+ res = _mm256_insertf128_ps(res, tmp2, 1);
+ return res;
+ }
+ template <int index, class B>
+ inline B get_half_complex_d(const B& real, const B& imag) noexcept
+ {
+ __m128d tmp0 = _mm256_extractf128_pd(real, index);
+ __m128d tmp1 = _mm256_extractf128_pd(imag, index);
+ __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1);
+ tmp0 = _mm_unpacklo_pd(tmp0, tmp1);
+ __m256d res = real;
+ res = _mm256_insertf128_pd(res, tmp0, 0);
+ res = _mm256_insertf128_pd(res, tmp2, 1);
+ return res;
+ }
+
+ // complex_low
+ template <class A>
+ inline batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
+ {
+ return get_half_complex_f<0>(self.real(), self.imag());
+ }
+ template <class A>
+ inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
+ {
+ return get_half_complex_d<0>(self.real(), self.imag());
+ }
+
+ // complex_high
+ template <class A>
+ inline batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
+ {
+ return get_half_complex_f<1>(self.real(), self.imag());
+ }
+ template <class A>
+ inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
+ {
+ return get_half_complex_d<1>(self.real(), self.imag());
+ }
+ }
+
+ // fast_cast
+ namespace detail
+ {
+ template <class A>
+ inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_cvtepi32_ps(self);
+ }
+
+ template <class A>
+ inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_cvttps_epi32(self);
+ }
+ }
+
+ // decr_if
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
+ {
+ return self + batch<T, A>(mask.data);
+ }
+
+ // div
+ template <class A>
+ inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_div_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_div_pd(self, other);
+ }
+
+ // eq
+ template <class A>
+ inline batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_EQ_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_EQ_OQ);
+ }
+ template <class A>
+ inline batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return ~(self != other);
+ }
+ template <class A>
+ inline batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return ~(self != other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return eq(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
+ self, other);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return ~(self != other);
+ }
+
+ // floor
+ template <class A>
+ inline batch<float, A> floor(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_floor_ps(self);
+ }
+ template <class A>
+ inline batch<double, A> floor(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_floor_pd(self);
+ }
+
+ // from_mask
+ template <class A>
+ inline batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+ {
+ alignas(A::alignment()) static const uint64_t lut32[] = {
+ 0x0000000000000000ul,
+ 0x00000000FFFFFFFFul,
+ 0xFFFFFFFF00000000ul,
+ 0xFFFFFFFFFFFFFFFFul,
+ };
+ assert(!(mask & ~0xFFul) && "inbound mask");
+ return _mm256_castsi256_ps(_mm256_setr_epi64x(lut32[mask & 0x3], lut32[(mask >> 2) & 0x3], lut32[(mask >> 4) & 0x3], lut32[mask >> 6]));
+ }
+ template <class A>
+ inline batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+ {
+ alignas(A::alignment()) static const uint64_t lut64[][4] = {
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ };
+ assert(!(mask & ~0xFul) && "inbound mask");
+ return _mm256_castsi256_pd(_mm256_load_si256((const __m256i*)lut64[mask]));
+ }
+ template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+ {
+ alignas(A::alignment()) static const uint32_t lut32[] = {
+ 0x00000000,
+ 0x000000FF,
+ 0x0000FF00,
+ 0x0000FFFF,
+ 0x00FF0000,
+ 0x00FF00FF,
+ 0x00FFFF00,
+ 0x00FFFFFF,
+ 0xFF000000,
+ 0xFF0000FF,
+ 0xFF00FF00,
+ 0xFF00FFFF,
+ 0xFFFF0000,
+ 0xFFFF00FF,
+ 0xFFFFFF00,
+ 0xFFFFFFFF,
+ };
+ alignas(A::alignment()) static const uint64_t lut64[] = {
+ 0x0000000000000000ul,
+ 0x000000000000FFFFul,
+ 0x00000000FFFF0000ul,
+ 0x00000000FFFFFFFFul,
+ 0x0000FFFF00000000ul,
+ 0x0000FFFF0000FFFFul,
+ 0x0000FFFFFFFF0000ul,
+ 0x0000FFFFFFFFFFFFul,
+ 0xFFFF000000000000ul,
+ 0xFFFF00000000FFFFul,
+ 0xFFFF0000FFFF0000ul,
+ 0xFFFF0000FFFFFFFFul,
+ 0xFFFFFFFF00000000ul,
+ 0xFFFFFFFF0000FFFFul,
+ 0xFFFFFFFFFFFF0000ul,
+ 0xFFFFFFFFFFFFFFFFul,
+ };
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ assert(!(mask & ~0xFFFFFFFFul) && "inbound mask");
+ return _mm256_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF],
+ lut32[(mask >> 8) & 0xF], lut32[(mask >> 12) & 0xF],
+ lut32[(mask >> 16) & 0xF], lut32[(mask >> 20) & 0xF],
+ lut32[(mask >> 24) & 0xF], lut32[mask >> 28]);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ assert(!(mask & ~0xFFFFul) && "inbound mask");
+ return _mm256_setr_epi64x(lut64[mask & 0xF], lut64[(mask >> 4) & 0xF], lut64[(mask >> 8) & 0xF], lut64[(mask >> 12) & 0xF]);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_castps_si256(from_mask(batch_bool<float, A> {}, mask, avx {}));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_castpd_si256(from_mask(batch_bool<double, A> {}, mask, avx {}));
+ }
+ }
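+
+ // Illustrative note (not part of the upstream sources): the lookup tables
+ // above expand the scalar bit mask four lanes at a time. For 8-bit lanes,
+ // a nibble of 0b0101 maps to lut32[0x5] == 0x00FF00FF, i.e. lanes 0 and 2
+ // of that four-lane group become all-ones. A scalar sketch of the same
+ // expansion (hypothetical helper, for exposition only):
+ //
+ //     uint32_t expand_nibble_u8(uint64_t mask, unsigned group)
+ //     {
+ //         uint32_t out = 0;
+ //         for (unsigned i = 0; i < 4; ++i)
+ //             if ((mask >> (4 * group + i)) & 1)
+ //                 out |= 0xFFu << (8 * i);
+ //         return out;
+ //     }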
+
+ // haddp
+ template <class A>
+ inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx>) noexcept
+ {
+ // row = (a,b,c,d,e,f,g,h)
+ // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7)
+ __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]);
+ // tmp1 = (c0+c1, c2+c3, d0+d1, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7)
+ __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]);
+ // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3,
+ // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7)
+ tmp1 = _mm256_hadd_ps(tmp0, tmp1);
+ // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7)
+ tmp0 = _mm256_hadd_ps(row[4], row[5]);
+ // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7)
+ __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]);
+ // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3,
+ // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7)
+ tmp2 = _mm256_hadd_ps(tmp0, tmp2);
+ // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3,
+ // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7)
+ tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000);
+ // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7,
+ // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3)
+ tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21);
+ return _mm256_add_ps(tmp0, tmp1);
+ }
+ template <class A>
+ inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx>) noexcept
+ {
+ // row = (a,b,c,d)
+ // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3)
+ __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]);
+ // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3)
+ __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]);
+ // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3)
+ __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100);
+ // tmp1 = (a2+a3, b2+b3, c2+c3, d2+d3)
+ tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21);
+ return _mm256_add_pd(tmp1, tmp2);
+ }
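+
+ // Illustrative note (not part of the upstream sources): haddp reduces each
+ // input row to its horizontal sum and packs the results into one batch,
+ // so haddp(rows)[i] == reduce_add(rows[i]). A hypothetical caller, going
+ // through the public xsimd::haddp entry point:
+ //
+ //     batch<double, avx> rows[4] = { r0, r1, r2, r3 }; // r0..r3 assumed defined
+ //     batch<double, avx> sums = xsimd::haddp(rows);
+ //     // sums == { sum(r0), sum(r1), sum(r2), sum(r3) }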
+
+ // incr_if
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
+ {
+ return self - batch<T, A>(mask.data);
+ }
+
+ // insert
+ template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
+ {
+#if !defined(_MSC_VER) || _MSC_VER > 1900
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_insert_epi8(self, val, I);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_insert_epi16(self, val, I);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_insert_epi32(self, val, I);
+ }
+ else
+ {
+ return insert(self, val, pos, generic {});
+ }
+#endif
+ return insert(self, val, pos, generic {});
+ }
+
+ // isnan
+ template <class A>
+ inline batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, self, _CMP_UNORD_Q);
+ }
+ template <class A>
+ inline batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, self, _CMP_UNORD_Q);
+ }
+
+ // le
+ template <class A>
+ inline batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_LE_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_LE_OQ);
+ }
+
+ // load_aligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
+ {
+ return _mm256_load_si256((__m256i const*)mem);
+ }
+ template <class A>
+ inline batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
+ {
+ return _mm256_load_ps(mem);
+ }
+ template <class A>
+ inline batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
+ {
+ return _mm256_load_pd(mem);
+ }
+
+ namespace detail
+ {
+ // load_complex
+ template <class A>
+ inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx>) noexcept
+ {
+ using batch_type = batch<float, A>;
+ __m128 tmp0 = _mm256_extractf128_ps(hi, 0);
+ __m128 tmp1 = _mm256_extractf128_ps(hi, 1);
+ __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0));
+ __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
+ batch_type real = _mm256_castps128_ps256(tmp_real);
+ batch_type imag = _mm256_castps128_ps256(tmp_imag);
+
+ tmp0 = _mm256_extractf128_ps(lo, 0);
+ tmp1 = _mm256_extractf128_ps(lo, 1);
+ tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0));
+ tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
+ real = _mm256_insertf128_ps(real, tmp_real, 1);
+ imag = _mm256_insertf128_ps(imag, tmp_imag, 1);
+ return { real, imag };
+ }
+ template <class A>
+ inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ __m128d tmp0 = _mm256_extractf128_pd(hi, 0);
+ __m128d tmp1 = _mm256_extractf128_pd(hi, 1);
+ batch_type real = _mm256_castpd128_pd256(_mm_unpacklo_pd(tmp0, tmp1));
+ batch_type imag = _mm256_castpd128_pd256(_mm_unpackhi_pd(tmp0, tmp1));
+
+ tmp0 = _mm256_extractf128_pd(lo, 0);
+ tmp1 = _mm256_extractf128_pd(lo, 1);
+ __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1);
+ __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1);
+ real = _mm256_blend_pd(real, re_tmp1, 12);
+ imag = _mm256_blend_pd(imag, im_tmp1, 12);
+ return { real, imag };
+ }
+ }
+
+ // load_unaligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
+ {
+ return _mm256_loadu_si256((__m256i const*)mem);
+ }
+ template <class A>
+ inline batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
+ {
+ return _mm256_loadu_ps(mem);
+ }
+ template <class A>
+ inline batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
+ {
+ return _mm256_loadu_pd(mem);
+ }
+
+ // lt
+ template <class A>
+ inline batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_LT_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_LT_OQ);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return lt(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+
+ // mask
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+ {
+ __m128i self_low, self_high;
+ detail::split_avx(self, self_low, self_high);
+ return mask(batch_bool<T, sse4_2>(self_low), sse4_2 {}) | (mask(batch_bool<T, sse4_2>(self_high), sse4_2 {}) << (128 / (8 * sizeof(T))));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_movemask_ps(_mm256_castsi256_ps(self));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_movemask_pd(_mm256_castsi256_pd(self));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ inline uint64_t mask(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_movemask_ps(self);
+ }
+
+ template <class A>
+ inline uint64_t mask(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_movemask_pd(self);
+ }
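+
+ // Illustrative note (not part of the upstream sources): mask() packs one
+ // bit per lane, bit i being set when lane i is true. E.g. for an 8-lane
+ // float batch_bool holding { true, false, false, true, false, false,
+ // false, false }, _mm256_movemask_ps returns 0b00001001.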
+
+ // max
+ template <class A>
+ inline batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_max_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_max_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return select(self > other, self, other);
+ }
+
+ // min
+ template <class A>
+ inline batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_min_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_min_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return select(self <= other, self, other);
+ }
+
+ // mul
+ template <class A>
+ inline batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_mul_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_mul_pd(self, other);
+ }
+
+ // nearbyint
+ template <class A>
+ inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
+ }
+ template <class A>
+ inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
+ }
+
+ // nearbyint_as_int
+ template <class A>
+ inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<avx>) noexcept
+ {
+ return _mm256_cvtps_epi32(self);
+ }
+
+ // neg
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> neg(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return 0 - self;
+ }
+ template <class A>
+ inline batch<float, A> neg(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
+ }
+ template <class A>
+ inline batch<double, A> neg(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000)));
+ }
+
+ // neq
+ template <class A>
+ inline batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_NEQ_UQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_NEQ_UQ);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return ~(self == other);
+ }
+
+ template <class A>
+ inline batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(self.data), _mm256_castsi256_ps(other.data)));
+ }
+
+ // reciprocal
+ template <class A>
+ inline batch<float, A> reciprocal(batch<float, A> const& self,
+ kernel::requires_arch<avx>) noexcept
+ {
+ return _mm256_rcp_ps(self);
+ }
+
+ // reduce_add
+ template <class A>
+ inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx>) noexcept
+ {
+ // Warning about _mm256_hadd_ps:
+ // _mm256_hadd_ps(a,b) gives
+ // (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). Hence we can't
+ // rely on a naive use of this method
+ // rhs = (x0, x1, x2, x3, x4, x5, x6, x7)
+ // tmp = (x4, x5, x6, x7, x0, x1, x2, x3)
+ __m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1);
+ // tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7)
+ tmp = _mm256_add_ps(rhs, tmp);
+ // tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -)
+ tmp = _mm256_hadd_ps(tmp, tmp);
+ // tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -)
+ tmp = _mm256_hadd_ps(tmp, tmp);
+ return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
+ }
+ template <class A>
+ inline double reduce_add(batch<double, A> const& rhs, requires_arch<avx>) noexcept
+ {
+ // rhs = (x0, x1, x2, x3)
+ // tmp = (x2, x3, x0, x1)
+ __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
+ // tmp = (x2+x0, x3+x1, -, -)
+ tmp = _mm256_add_pd(rhs, tmp);
+ // tmp = (x2+x0+x3+x1, -, -, -)
+ tmp = _mm256_hadd_pd(tmp, tmp);
+ return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ __m128i low, high;
+ detail::split_avx(self, low, high);
+ batch<T, sse4_2> blow(low), bhigh(high);
+ return reduce_add(blow) + reduce_add(bhigh);
+ }
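+
+ // Illustrative note (not part of the upstream sources): every reduce_add
+ // overload above is equivalent to the scalar loop
+ //
+ //     alignas(A::alignment()) T buf[batch<T, A>::size];
+ //     self.store_aligned(buf);
+ //     T acc = T(0);
+ //     for (T v : buf) acc += v;
+ //
+ // the AVX versions simply keep the accumulation inside vector registers.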
+
+ // reduce_max
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+ inline T reduce_max(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = detail::shuffle(1, 0);
+ batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
+ batch<T, A> acc = max(self, step);
+ __m128i low = _mm256_castsi256_si128(acc);
+ return reduce_max(batch<T, sse4_2>(low));
+ }
+
+ // reduce_min
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+ inline T reduce_min(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = detail::shuffle(1, 0);
+ batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
+ batch<T, A> acc = min(self, step);
+ __m128i low = _mm256_castsi256_si128(acc);
+ return reduce_min(batch<T, sse4_2>(low));
+ }
+
+ // rsqrt
+ template <class A>
+ inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_rsqrt_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(val)));
+ }
+
+ // sadd
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ auto mask = (other >> (8 * sizeof(T) - 1));
+ auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
+ auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
+ return other + select(batch_bool<T, A>(mask.data), self_neg_branch, self_pos_branch);
+ }
+ else
+ {
+ const auto diffmax = std::numeric_limits<T>::max() - self;
+ const auto mindiff = min(diffmax, other);
+ return self + mindiff;
+ }
+ }
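+
+ // Illustrative note (not part of the upstream sources): sadd saturates
+ // instead of wrapping. In the signed branch, `mask` replicates the sign
+ // bit of `other` across each lane and selects the pre-clamped operand:
+ // for other >= 0 the result is other + min(max_T - other, self), which
+ // caps at max_T; for other < 0 it is other + max(min_T - other, self),
+ // which caps at min_T. Worked example for int8_t: sadd(100, 50) computes
+ // 50 + min(127 - 50, 100) = 50 + 77 = 127. The unsigned branch computes
+ // self + min(other, max_T - self), capping at max_T.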
+
+ // select
+ template <class A>
+ inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ return _mm256_blendv_ps(false_br, true_br, cond);
+ }
+ template <class A>
+ inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ return _mm256_blendv_pd(false_br, true_br, cond);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ __m128i cond_low, cond_hi;
+ detail::split_avx(cond, cond_low, cond_hi);
+
+ __m128i true_low, true_hi;
+ detail::split_avx(true_br, true_low, true_hi);
+
+ __m128i false_low, false_hi;
+ detail::split_avx(false_br, false_low, false_hi);
+
+ __m128i res_low = select(batch_bool<T, sse4_2>(cond_low), batch<T, sse4_2>(true_low), batch<T, sse4_2>(false_low), sse4_2 {});
+ __m128i res_hi = select(batch_bool<T, sse4_2>(cond_hi), batch<T, sse4_2>(true_hi), batch<T, sse4_2>(false_hi), sse4_2 {});
+ return detail::merge_sse(res_low, res_hi);
+ }
+ template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
+ }
+
+ template <class A, bool... Values>
+ inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = batch_bool_constant<batch<float, A>, Values...>::mask();
+ return _mm256_blend_ps(false_br, true_br, mask);
+ }
+
+ template <class A, bool... Values>
+ inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = batch_bool_constant<batch<double, A>, Values...>::mask();
+ return _mm256_blend_pd(false_br, true_br, mask);
+ }
+
+ // set
+ template <class A, class... Values>
+ inline batch<float, A> set(batch<float, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
+ return _mm256_setr_ps(values...);
+ }
+
+ template <class A, class... Values>
+ inline batch<double, A> set(batch<double, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
+ return _mm256_setr_pd(values...);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3) noexcept
+ {
+ return _mm256_set_epi64x(v3, v2, v1, v0);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+ {
+ return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+ {
+ return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
+ {
+ return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
+ }
+
+ template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
+ }
+
+ template <class A, class... Values>
+ inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
+ return _mm256_castsi256_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data);
+ }
+
+ template <class A, class... Values>
+ inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
+ return _mm256_castsi256_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data);
+ }
+
+ // shuffle
+ template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
+ inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
+ {
+ constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
+ // shuffle within lane
+ if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I0 < 4 && I1 < 4 && I2 >= 8 && I2 < 12 && I3 >= 8 && I3 < 12)
+ return _mm256_shuffle_ps(x, y, smask);
+
+ // shuffle within opposite lane
+ if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I2 < 4 && I3 < 4 && I0 >= 8 && I0 < 12 && I1 >= 8 && I1 < 12)
+ return _mm256_shuffle_ps(y, x, smask);
+
+ return shuffle(x, y, mask, generic {});
+ }
+
+ template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
+ inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
+ {
+ constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3);
+ // shuffle within lane
+ if (I0 < 2 && I1 >= 4 && I1 < 6 && I2 >= 2 && I2 < 4 && I3 >= 6)
+ return _mm256_shuffle_pd(x, y, smask);
+
+ // shuffle within opposite lane
+ if (I1 < 2 && I0 >= 4 && I0 < 6 && I3 >= 2 && I3 < 4 && I2 >= 6)
+ return _mm256_shuffle_pd(y, x, smask);
+
+ return shuffle(x, y, mask, generic {});
+ }
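+
+ // Illustrative note (not part of the upstream sources): the shuffle
+ // intrinsics used above operate per 128-bit lane; _mm256_shuffle_ps takes
+ // its first two outputs per lane from the first operand and its last two
+ // from the second. The conditions above accept only masks of that shape
+ // and fall back to the generic implementation otherwise. E.g. the float
+ // mask {0, 1, 8, 9, 4, 5, 12, 13} matches the first branch and compiles
+ // to a single vshufps.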
+
+ // slide_left
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 256)
+ {
+ return batch<T, A>(T(0));
+ }
+ if (BitCount > 128)
+ {
+ constexpr unsigned M = (BitCount - 128) / 8;
+ __m128i low = _mm256_castsi256_si128(x);
+ auto y = _mm_slli_si128(low, M);
+ __m256i zero = _mm256_setzero_si256();
+ return _mm256_insertf128_si256(zero, y, 1);
+ }
+ if (BitCount == 128)
+ {
+ __m128i low = _mm256_castsi256_si128(x);
+ __m256i zero = _mm256_setzero_si256();
+ return _mm256_insertf128_si256(zero, low, 1);
+ }
+ // shifting by [0, 128[ bits
+ constexpr unsigned M = BitCount / 8;
+
+ __m128i low = _mm256_castsi256_si128(x);
+ auto ylow = _mm_slli_si128(low, M);
+ auto zlow = _mm_srli_si128(low, 16 - M);
+
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ auto yhigh = _mm_slli_si128(high, M);
+
+ __m256i res = _mm256_castsi128_si256(ylow);
+ return _mm256_insertf128_si256(res, _mm_or_si128(yhigh, zlow), 1);
+ }
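+
+ // Illustrative note (not part of the upstream sources): for shifts below
+ // 128 bits both halves are shifted independently and the bytes that fall
+ // off the top of the low half are OR-ed into the bottom of the high half.
+ // E.g. sliding left by 4 bytes (N == 4, so M == 4):
+ //
+ //     low'  = _mm_slli_si128(low, 4)          // low  << 32 bits
+ //     carry = _mm_srli_si128(low, 12)         // top 4 bytes of low
+ //     high' = _mm_slli_si128(high, 4) | carry // high << 32 bits, plus carry
+ //
+ // slide_right below mirrors the same scheme in the other direction.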
+
+ // slide_right
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 256)
+ {
+ return batch<T, A>(T(0));
+ }
+ if (BitCount > 128)
+ {
+ constexpr unsigned M = (BitCount - 128) / 8;
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ __m128i y = _mm_srli_si128(high, M);
+ __m256i zero = _mm256_setzero_si256();
+ return _mm256_insertf128_si256(zero, y, 0);
+ }
+ if (BitCount == 128)
+ {
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ return _mm256_castsi128_si256(high);
+ }
+ // shifting by [0, 128[ bits
+ constexpr unsigned M = BitCount / 8;
+
+ __m128i low = _mm256_castsi256_si128(x);
+ auto ylow = _mm_srli_si128(low, M);
+
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ auto yhigh = _mm_srli_si128(high, M);
+ auto zhigh = _mm_slli_si128(high, 16 - M);
+
+ __m256i res = _mm256_castsi128_si256(_mm_or_si128(ylow, zhigh));
+ return _mm256_insertf128_si256(res, yhigh, 1);
+ }
+
+ // sqrt
+ template <class A>
+ inline batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_sqrt_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_sqrt_pd(val);
+ }
+
+ // ssub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ return sadd(self, -other);
+ }
+ else
+ {
+ const auto diff = min(self, other);
+ return self - diff;
+ }
+ }
+
+ // store_aligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_si256((__m256i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_si256((__m256i*)mem, self);
+ }
+ template <class A>
+ inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_ps(mem, self);
+ }
+ template <class A>
+ inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_pd(mem, self);
+ }
+
+ // store_unaligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_si256((__m256i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_si256((__m256i*)mem, self);
+ }
+ template <class A>
+ inline void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_ps(mem, self);
+ }
+ template <class A>
+ inline void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_pd(mem, self);
+ }
+
+ // sub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return sub(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A>
+ inline batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_sub_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_sub_pd(self, other);
+ }
+
+ // swizzle (dynamic mask)
+ template <class A>
+ inline batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
+ {
+ // duplicate low and high part of input
+ __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
+ __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
+
+ __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
+ __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
+
+ // normalize mask
+ batch<uint32_t, A> half_mask = mask % 4;
+
+ // permute within each lane
+ __m256 r0 = _mm256_permutevar_ps(low_low, half_mask);
+ __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask);
+
+ // mask to choose the right lane
+ batch_bool<uint32_t, A> blend_mask = mask >= 4;
+
+ // blend the two permutes
+ return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
+ }
+
+ template <class A>
+ inline batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
+ {
+ // duplicate low and high part of input
+ __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
+ __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
+
+ __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
+ __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
+
+ // normalize mask
+ batch<uint64_t, A> half_mask = -(mask & 1);
+
+ // permute within each lane
+ __m256d r0 = _mm256_permutevar_pd(low_low, half_mask);
+ __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask);
+
+ // mask to choose the right lane
+ batch_bool<uint64_t, A> blend_mask = mask >= 2;
+
+ // blend the two permutes
+ return _mm256_blendv_pd(r0, r1, batch_bool_cast<double>(blend_mask));
+ }
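+
+ // Illustrative note (not part of the upstream sources): AVX1 has no
+ // cross-lane variable permute, so the swizzles above (and the
+ // constant-mask versions below) duplicate the input as (low, low) and
+ // (high, high), permute each copy within its 128-bit lanes with
+ // vpermilps/vpermilpd, then blend lane-by-lane depending on whether the
+ // requested index falls in the low or the high half. E.g. for floats,
+ // mask = {7, 0, 3, 5, 1, 6, 2, 4} uses half_mask = mask % 4 for the
+ // in-lane permutes and (mask >= 4) to pick between the two results.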
+
+ template <class A, typename T, detail::enable_sized_integral_t<T, 4> = 0>
+ inline batch<T, A> swizzle(batch<T, A> const& self, batch<uint32_t, A> const& mask, requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<float>(self), mask));
+ }
+
+ template <class A, typename T, detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch<T, A>
+ swizzle(batch<T, A> const& self, batch<uint64_t, A> const& mask, requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<double>(self), mask));
+ }
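+
+ // Illustrative sketch only (assumes the runtime-index swizzle entry point
+ // exposed by xsimd): reversing the eight floats of a batch `v`
+ //   batch<uint32_t, A> idx(7, 6, 5, 4, 3, 2, 1, 0);
+ //   swizzle(v, idx); // -> { v[7], v[6], ..., v[0] }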
+
+ // swizzle (constant mask)
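+ // Same duplicate / permute / blend scheme as the dynamic version above,
+ // but the indices are known at compile time, so the lane selection folds
+ // into the immediate operand of _mm256_blend_ps / _mm256_blend_pd.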
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+ inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
+ {
+ // broadcast the high 128-bit lane (hi_hi) and the low 128-bit lane (low_low) of the input
+ __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
+ __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
+
+ __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
+ __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
+
+ // normalize mask
+ batch_constant<batch<uint32_t, A>, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+
+ // permute within each lane
+ __m256 r0 = _mm256_permutevar_ps(low_low, (batch<uint32_t, A>)half_mask);
+ __m256 r1 = _mm256_permutevar_ps(hi_hi, (batch<uint32_t, A>)half_mask);
+
+ // mask to choose the right lane
+ batch_bool_constant<batch<uint32_t, A>, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
+
+ // blend the two permutes
+ constexpr auto mask = blend_mask.mask();
+ return _mm256_blend_ps(r0, r1, mask);
+ }
+
+ template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+ inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx>) noexcept
+ {
+ // broadcast the high 128-bit lane (hi_hi) and the low 128-bit lane (low_low) of the input
+ __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
+ __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
+
+ __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
+ __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
+
+ // normalize mask
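+ // (V % 2) * -1 is the compile-time analogue of -(mask & 1): 0 stays 0 and
+ // 1 wraps to all-ones, so _mm256_permutevar_pd sees bit 1 set only when
+ // the in-lane index is 1.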
+ batch_constant<batch<uint64_t, A>, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
+
+ // permute within each lane
+ __m256d r0 = _mm256_permutevar_pd(low_low, (batch<uint64_t, A>)half_mask);
+ __m256d r1 = _mm256_permutevar_pd(hi_hi, (batch<uint64_t, A>)half_mask);
+
+ // mask to choose the right lane
+ batch_bool_constant<batch<uint64_t, A>, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+
+ // blend the two permutes
+ constexpr auto mask = blend_mask.mask();
+ return _mm256_blend_pd(r0, r1, mask);
+ }
+ template <class A,
+ typename T,
+ uint32_t V0,
+ uint32_t V1,
+ uint32_t V2,
+ uint32_t V3,
+ uint32_t V4,
+ uint32_t V5,
+ uint32_t V6,
+ uint32_t V7,
+ detail::enable_sized_integral_t<T, 4> = 0>
+ inline batch<T, A> swizzle(batch<T, A> const& self,
+ batch_constant<batch<uint32_t, A>,
+ V0,
+ V1,
+ V2,
+ V3,
+ V4,
+ V5,
+ V6,
+ V7> const& mask,
+ requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<float>(self), mask));
+ }
+
+ template <class A,
+ typename T,
+ uint64_t V0,
+ uint64_t V1,
+ uint64_t V2,
+ uint64_t V3,
+ detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch<T, A>
+ swizzle(batch<T, A> const& self,
+ batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> const& mask,
+ requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<double>(self), mask));
+ }
+
+ // trunc
+ template <class A>
+ inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_ps(self, _MM_FROUND_TO_ZERO);
+ }
+ template <class A>
+ inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_pd(self, _MM_FROUND_TO_ZERO);
+ }
+
+ // zip_hi
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+ {
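+ // AVX1 has no 256-bit integer unpack, so for 8- and 16-bit elements
+ // interleave the high 128-bit halves with SSE unpacks and reassemble.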
+ // extract the high 128-bit halves
+ __m128i self_hi = _mm256_extractf128_si256(self, 1);
+ __m128i other_hi = _mm256_extractf128_si256(other, 1);
+
+ // interleave
+ __m128i res_lo, res_hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ res_lo = _mm_unpacklo_epi8(self_hi, other_hi);
+ res_hi = _mm_unpackhi_epi8(self_hi, other_hi);
+ }
+ else
+ {
+ res_lo = _mm_unpacklo_epi16(self_hi, other_hi);
+ res_hi = _mm_unpackhi_epi16(self_hi, other_hi);
+ }
+
+ // fuse the two 128-bit results back into one 256-bit register
+ return _mm256_castps_si256(
+ _mm256_insertf128_ps(
+ _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)),
+ _mm_castsi128_ps(res_hi),
+ 1));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ return _mm256_castps_si256(_mm256_permute2f128_ps(lo, hi, 0x31));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ return _mm256_castpd_si256(_mm256_permute2f128_pd(lo, hi, 0x31));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ inline batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
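+ // the unpacks interleave within each 128-bit lane; 0x31 then gathers the
+ // upper lane of both intermediates, i.e. the interleave of the high halves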
+ auto lo = _mm256_unpacklo_ps(self, other);
+ auto hi = _mm256_unpackhi_ps(self, other);
+ return _mm256_permute2f128_ps(lo, hi, 0x31);
+ }
+ template <class A>
+ inline batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ auto lo = _mm256_unpacklo_pd(self, other);
+ auto hi = _mm256_unpackhi_pd(self, other);
+ return _mm256_permute2f128_pd(lo, hi, 0x31);
+ }
+
+ // zip_lo
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+ {
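+ // same approach as zip_hi, applied to the low 128-bit halves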
+ // extract the low 128-bit halves
+ __m128i self_lo = _mm256_extractf128_si256(self, 0);
+ __m128i other_lo = _mm256_extractf128_si256(other, 0);
+
+ // interleave
+ __m128i res_lo, res_hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ res_lo = _mm_unpacklo_epi8(self_lo, other_lo);
+ res_hi = _mm_unpackhi_epi8(self_lo, other_lo);
+ }
+ else
+ {
+ res_lo = _mm_unpacklo_epi16(self_lo, other_lo);
+ res_hi = _mm_unpackhi_epi16(self_lo, other_lo);
+ }
+
+ // fuse the two 128-bit results back into one 256-bit register
+ return _mm256_castps_si256(
+ _mm256_insertf128_ps(
+ _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)),
+ _mm_castsi128_ps(res_hi),
+ 1));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ return _mm256_castps_si256(_mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ return _mm256_castpd_si256(_mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ template <class A>
+ inline batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
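+ // the unpacks interleave within each 128-bit lane; keeping lo's lower lane
+ // and inserting hi's lower lane yields the interleave of the low halves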
+ auto lo = _mm256_unpacklo_ps(self, other);
+ auto hi = _mm256_unpackhi_ps(self, other);
+ return _mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1);
+ }
+ template <class A>
+ inline batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ auto lo = _mm256_unpacklo_pd(self, other);
+ auto hi = _mm256_unpackhi_pd(self, other);
+ return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1);
+ }
+ }
+}
+
+#endif