Diffstat (limited to 'third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp')
-rw-r--r-- third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp | 983
1 file changed, 983 insertions(+), 0 deletions(-)
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp
new file mode 100644
index 0000000000..a5b07ec9da
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp
@@ -0,0 +1,983 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX2_HPP
+#define XSIMD_AVX2_HPP
+
+#include <complex>
+#include <type_traits>
+
+#include "../types/xsimd_avx2_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
+ // abs
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
+ {
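+            // For unsigned T, |x| == x, so the input is returned unchanged below.
+            // AVX2 has no 256-bit abs for 64-bit lanes, so that case defers to the AVX kernel.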
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_abs_epi8(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_abs_epi16(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_abs_epi32(self);
+ }
+ else
+ {
+ return abs(self, avx {});
+ }
+ }
+ return self;
+ }
+
+ // add
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_add_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_add_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_add_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_add_epi64(self, other);
+ }
+ else
+ {
+ return add(self, other, avx {});
+ }
+ }
+
+ // bitwise_and
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_and_si256(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_and_si256(self, other);
+ }
+
+ // bitwise_andnot
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
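+            // _mm256_andnot_si256(a, b) computes (~a) & b, so the arguments are
+            // swapped here to yield self & ~other.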
+ return _mm256_andnot_si256(other, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_andnot_si256(other, self);
+ }
+
+ // bitwise_not
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
+ {
+ return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
+ {
+ return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
+ }
+
+ // bitwise_lshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_slli_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_slli_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_slli_epi64(self, other);
+ }
+ else
+ {
+ return bitwise_lshift(self, other, avx {});
+ }
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
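+            // AVX2 only provides per-element variable shifts for 32- and 64-bit lanes
+            // (_mm256_sllv_epi32/epi64); 8- and 16-bit elements fall back to the AVX kernel.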
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_sllv_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_sllv_epi64(self, other);
+ }
+ else
+ {
+ return bitwise_lshift(self, other, avx {});
+ }
+ }
+
+ // bitwise_or
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_or_si256(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_or_si256(self, other);
+ }
+
+ // bitwise_rshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
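+                    // There is no 8-bit arithmetic shift, so the batch is shifted as 16-bit
+                    // lanes: the top `other` bits of every low byte then hold bits leaked
+                    // from the neighbouring high byte instead of sign bits. sign_mask marks
+                    // those bits, and they are refilled with ones only for negative bytes.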
+ __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF);
+ __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self);
+ __m256i res = _mm256_srai_epi16(self, other);
+ return _mm256_or_si256(
+ detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
+ sign_mask, cmp_is_negative),
+ _mm256_andnot_si256(sign_mask, res));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_srai_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_srai_epi32(self, other);
+ }
+ else
+ {
+ return bitwise_rshift(self, other, avx {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_srli_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_srli_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_srli_epi64(self, other);
+ }
+ else
+ {
+ return bitwise_rshift(self, other, avx {});
+ }
+ }
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_srav_epi32(self, other);
+ }
+ else
+ {
+ return bitwise_rshift(self, other, avx {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_srlv_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_srlv_epi64(self, other);
+ }
+ else
+ {
+ return bitwise_rshift(self, other, avx {});
+ }
+ }
+ }
+
+ // bitwise_xor
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_xor_si256(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_xor_si256(self, other);
+ }
+
+ // complex_low
+ template <class A>
+ inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
+ {
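+            // Interleave the low halves of real() and imag(): each operand is permuted so
+            // the wanted value sits in every other 64-bit slot, then the blend mask 0b1010
+            // picks the imaginary parts into the odd positions, giving (r0, i0, r1, i1).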
+ __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0));
+ __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0));
+ return _mm256_blend_pd(tmp0, tmp1, 10);
+ }
+
+ // complex_high
+ template <class A>
+ inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
+ {
+ __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2));
+ __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0));
+ return _mm256_blend_pd(tmp0, tmp1, 10);
+ }
+
+ // fast_cast
+ namespace detail
+ {
+
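+            // Both conversions below use the exponent-bias trick from the linked answer:
+            // the 64-bit integer is split into 32-bit halves, each half is combined with
+            // the bit pattern of a magic constant (2^84, or 3*2^67 in the signed case, for
+            // the high half; 2^52 for the low half) so it reads as a biased double;
+            // subtracting the magic constants and adding the two halves reconstructs the
+            // value with a single rounding, assuming the default round-to-nearest mode.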
+ template <class A>
+ inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
+ {
+ // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+ // adapted to avx
+ __m256i xH = _mm256_srli_epi64(x, 32);
+ xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
+ __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
+ 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
+ __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
+ __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
+ return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
+ }
+
+ template <class A>
+ inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
+ {
+ // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+ // adapted to avx
+ __m256i xH = _mm256_srai_epi32(x, 16);
+ xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
+ xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67
+ __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
+ 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
+ __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
+ __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
+ return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
+ }
+ }
+
+ // eq
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_cmpeq_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_cmpeq_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_cmpeq_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_cmpeq_epi64(self, other);
+ }
+ else
+ {
+ return eq(self, other, avx {});
+ }
+ }
+
+ // gather
+ template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
+ inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+ kernel::requires_arch<avx2>) noexcept
+ {
+ // scatter for this one is AVX512F+AVX512VL
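+            // _mm256_i32gather_epi32 loads from src + index[i] * scale; passing sizeof(T)
+            // as the scale means the index vector counts elements, not bytes.
+            // Illustrative sketch: with index = {0, 2, 4, ...} this gathers src[0], src[2], src[4], ...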
+ return _mm256_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
+ }
+
+ template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
+ inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+ kernel::requires_arch<avx2>) noexcept
+ {
+ // scatter for this one is AVX512F+AVX512VL
+ return _mm256_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
+ }
+
+ template <class A, class U,
+ detail::enable_sized_integral_t<U, 4> = 0>
+ inline batch<float, A> gather(batch<float, A> const&, float const* src,
+ batch<U, A> const& index,
+ kernel::requires_arch<avx2>) noexcept
+ {
+ // scatter for this one is AVX512F+AVX512VL
+ return _mm256_i32gather_ps(src, index, sizeof(float));
+ }
+
+ template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
+ inline batch<double, A> gather(batch<double, A> const&, double const* src,
+ batch<U, A> const& index,
+ requires_arch<avx2>) noexcept
+ {
+ // scatter for this one is AVX512F+AVX512VL
+ return _mm256_i64gather_pd(src, index, sizeof(double));
+ }
+
+ // gather: handmade conversions
+ template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
+ inline batch<float, A> gather(batch<float, A> const&, double const* src,
+ batch<V, A> const& index,
+ requires_arch<avx2>) noexcept
+ {
+ const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
+ const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
+ return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data));
+ }
+
+ template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
+ inline batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
+ batch<V, A> const& index,
+ requires_arch<avx2>) noexcept
+ {
+ const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
+ const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
+ return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data));
+ }
+
+ // lt
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_cmpgt_epi8(other, self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_cmpgt_epi16(other, self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_cmpgt_epi32(other, self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_cmpgt_epi64(other, self);
+ }
+ else
+ {
+ return lt(self, other, avx {});
+ }
+ }
+ else
+ {
+ return lt(self, other, avx {});
+ }
+ }
+
+ // load_complex
+ template <class A>
+ inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
+ {
+ using batch_type = batch<float, A>;
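+            // The two inputs hold interleaved (re, im) pairs as loaded from memory.
+            // _mm256_shuffle_ps collects the even (real) or odd (imaginary) floats within
+            // each 128-bit lane; the cast to pd plus _mm256_permute4x64_pd then reorders
+            // those float pairs across lanes into element order.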
+ batch_type real = _mm256_castpd_ps(
+ _mm256_permute4x64_pd(
+ _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))),
+ _MM_SHUFFLE(3, 1, 2, 0)));
+ batch_type imag = _mm256_castpd_ps(
+ _mm256_permute4x64_pd(
+ _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))),
+ _MM_SHUFFLE(3, 1, 2, 0)));
+ return { real, imag };
+ }
+ template <class A>
+ inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx2>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
+ batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
+ return { real, imag };
+ }
+ // mask
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
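+                // _mm256_movemask_epi8 yields two (identical) bits per 16-bit element;
+                // mask_lut folds each group of 8 such bits down to 4 bits, one per element.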
+ uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
+ return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12);
+ }
+ else
+ {
+ return mask(self, avx {});
+ }
+ }
+
+ // max
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_max_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_max_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_max_epi32(self, other);
+ }
+ else
+ {
+ return max(self, other, avx {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_max_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_max_epu16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_max_epu32(self, other);
+ }
+ else
+ {
+ return max(self, other, avx {});
+ }
+ }
+ }
+
+ // min
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_min_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_min_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_min_epi32(self, other);
+ }
+ else
+ {
+ return min(self, other, avx {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_min_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_min_epu16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_min_epu32(self, other);
+ }
+ else
+ {
+ return min(self, other, avx {});
+ }
+ }
+ }
+
+ // mul
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
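+                // No 8-bit multiply exists, so multiply as 16-bit lanes: res_lo holds the
+                // correct product in every low byte; for the high bytes, other is shifted
+                // down and self is masked so that res_hi carries their product in the high
+                // byte position. The byte blend then stitches the two halves together.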
+ __m256i mask_hi = _mm256_set1_epi32(0xFF00FF00);
+ __m256i res_lo = _mm256_mullo_epi16(self, other);
+ __m256i other_hi = _mm256_srli_epi16(other, 8);
+ __m256i self_hi = _mm256_and_si256(self, mask_hi);
+ __m256i res_hi = _mm256_mullo_epi16(self_hi, other_hi);
+ __m256i res = _mm256_blendv_epi8(res_lo, res_hi, mask_hi);
+ return res;
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_mullo_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_mullo_epi32(self, other);
+ }
+ else
+ {
+ return mul(self, other, avx {});
+ }
+ }
+
+ // reduce_add
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
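+                // Two horizontal adds reduce each 128-bit lane to one (replicated) sum;
+                // the upper lane is then extracted and added to the lower lane.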
+ __m256i tmp1 = _mm256_hadd_epi32(self, self);
+ __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
+ __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
+ __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
+ return _mm_cvtsi128_si32(tmp4);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
+ __m256i tmp2 = _mm256_add_epi64(self, tmp1);
+ __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
+ __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
+#if defined(__x86_64__)
+ return _mm_cvtsi128_si64(res);
+#else
+ __m128i m;
+ _mm_storel_epi64(&m, res);
+ int64_t i;
+ std::memcpy(&i, &m, sizeof(i));
+ return i;
+#endif
+ }
+ else
+ {
+ return reduce_add(self, avx {});
+ }
+ }
+
+ // rotate_right
+ template <size_t N, class A>
+ inline batch<uint16_t, A> rotate_right(batch<uint16_t, A> const& self, requires_arch<avx2>) noexcept
+ {
+ return _mm256_alignr_epi8(self, self, N);
+ }
+ template <size_t N, class A>
+ inline batch<int16_t, A> rotate_right(batch<int16_t, A> const& self, requires_arch<avx2>) noexcept
+ {
+ return bitwise_cast<int16_t>(rotate_right<N, A>(bitwise_cast<uint16_t>(self), avx2 {}));
+ }
+
+ // sadd
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_adds_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_adds_epi16(self, other);
+ }
+ else
+ {
+ return sadd(self, other, avx {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_adds_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_adds_epu16(self, other);
+ }
+ else
+ {
+ return sadd(self, other, avx {});
+ }
+ }
+ }
+
+ // select
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+ {
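+            // _mm256_blendv_epi8 selects byte-wise on the top bit of each byte of cond.
+            // Since cond holds all-ones or all-zeros per element, the same byte blend is
+            // valid for every element width handled below.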
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_blendv_epi8(false_br, true_br, cond);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_blendv_epi8(false_br, true_br, cond);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_blendv_epi8(false_br, true_br, cond);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_blendv_epi8(false_br, true_br, cond);
+ }
+ else
+ {
+ return select(cond, true_br, false_br, avx {});
+ }
+ }
+ template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+ {
+ constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
+            // FIXME: the compiler does not accept `mask` as an immediate argument to
+            // _mm256_blend_epi16, although _mm256_blend_epi32 accepts it fine, so the
+            // 16-bit case is disabled:
+            // case 2: return _mm256_blend_epi16(false_br, true_br, mask);
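+            // _mm256_blend_epi32 takes an 8-bit immediate with one bit per 32-bit element;
+            // for 64-bit elements detail::interleave widens the 4-bit mask so that each
+            // bit covers both 32-bit halves of its element.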
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_blend_epi32(false_br, true_br, mask);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ constexpr int imask = detail::interleave(mask);
+ return _mm256_blend_epi32(false_br, true_br, imask);
+ }
+ else
+ {
+ return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
+ }
+ }
+
+ // slide_left
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 256)
+ {
+ return batch<T, A>(T(0));
+ }
+ if (BitCount > 128)
+ {
+ constexpr unsigned M = (BitCount - 128) / 8;
+ auto y = _mm256_bslli_epi128(x, M);
+ return _mm256_permute2x128_si256(y, y, 0x28);
+ }
+ if (BitCount == 128)
+ {
+ return _mm256_permute2x128_si256(x, x, 0x28);
+ }
+ // shifting by [0, 128[ bits
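+            // _mm256_bslli_epi128 shifts bytes within each 128-bit lane only, so the bytes
+            // that should cross the lane boundary are recovered separately: z shifts them
+            // to the bottom of their lane and the 0x28 permute moves the low lane into the
+            // high lane (zeroing the low lane) before OR-ing the two parts together.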
+ constexpr unsigned M = BitCount / 8;
+ auto y = _mm256_bslli_epi128(x, M);
+ auto z = _mm256_bsrli_epi128(x, 16 - M);
+ auto w = _mm256_permute2x128_si256(z, z, 0x28);
+ return _mm256_or_si256(y, w);
+ }
+
+ // slide_right
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 256)
+ {
+ return batch<T, A>(T(0));
+ }
+ if (BitCount > 128)
+ {
+ constexpr unsigned M = (BitCount - 128) / 8;
+ auto y = _mm256_bsrli_epi128(x, M);
+ return _mm256_permute2x128_si256(y, y, 0x81);
+ }
+ if (BitCount == 128)
+ {
+ return _mm256_permute2x128_si256(x, x, 0x81);
+ }
+ // shifting by [0, 128[ bits
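+            // Mirror image of slide_left: here the 0x81 permute moves the high lane down
+            // into the low lane (zeroing the high lane) to recover the bytes that cross
+            // the 128-bit boundary.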
+ constexpr unsigned M = BitCount / 8;
+ auto y = _mm256_bsrli_epi128(x, M);
+ auto z = _mm256_bslli_epi128(x, 16 - M);
+ auto w = _mm256_permute2x128_si256(z, z, 0x81);
+ return _mm256_or_si256(y, w);
+ }
+
+ // ssub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_subs_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_subs_epi16(self, other);
+ }
+ else
+ {
+ return ssub(self, other, avx {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_subs_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_subs_epu16(self, other);
+ }
+ else
+ {
+ return ssub(self, other, avx {});
+ }
+ }
+ }
+
+ // sub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_sub_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_sub_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_sub_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_sub_epi64(self, other);
+ }
+ else
+ {
+ return sub(self, other, avx {});
+ }
+ }
+
+ // swizzle (dynamic mask)
+ template <class A>
+ inline batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
+ {
+ return _mm256_permutevar8x32_ps(self, mask);
+ }
+
+ template <class A>
+ inline batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
+ {
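+            // AVX2 has no run-time 64-bit permute, so each 64-bit index k is expanded to
+            // the 32-bit index pair {2k, 2k+1} (mask * comb replicates 2k into both halves,
+            // the broadcaster adds +1 to the upper half) and the float permutevar8x32 is used.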
+ batch<uint32_t, A> broadcaster = { 0, 1, 0, 1, 0, 1, 0, 1 };
+ constexpr uint64_t comb = 0x0000000100000001ul * 2;
+ return bitwise_cast<double>(swizzle(bitwise_cast<float>(self), bitwise_cast<uint32_t>(mask * comb) + broadcaster, avx2 {}));
+ }
+
+ template <class A>
+ inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
+ {
+ return bitwise_cast<uint64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
+ }
+ template <class A>
+ inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
+ {
+ return bitwise_cast<int64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
+ }
+ template <class A>
+ inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
+ {
+ return _mm256_permutevar8x32_epi32(self, mask);
+ }
+ template <class A>
+ inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
+ {
+ return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
+ }
+
+ // swizzle (constant mask)
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+ inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+ {
+ return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask);
+ }
+
+ template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+ inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+ {
+ constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
+ return _mm256_permute4x64_pd(self, mask);
+ }
+
+ template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+ inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+ {
+ constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
+ return _mm256_permute4x64_epi64(self, mask);
+ }
+ template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+ inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
+ {
+ return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
+ }
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+ inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+ {
+ return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask);
+ }
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+ inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+ {
+ return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
+ }
+
+ // zip_hi
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
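+            // The unpack intrinsics interleave within each 128-bit lane; the 0x31 permute
+            // then combines the upper lanes of the unpacklo and unpackhi results, which is
+            // the zip of the two inputs' high halves.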
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ auto lo = _mm256_unpacklo_epi8(self, other);
+ auto hi = _mm256_unpackhi_epi8(self, other);
+ return _mm256_permute2f128_si256(lo, hi, 0x31);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ auto lo = _mm256_unpacklo_epi16(self, other);
+ auto hi = _mm256_unpackhi_epi16(self, other);
+ return _mm256_permute2f128_si256(lo, hi, 0x31);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ auto lo = _mm256_unpacklo_epi32(self, other);
+ auto hi = _mm256_unpackhi_epi32(self, other);
+ return _mm256_permute2f128_si256(lo, hi, 0x31);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto lo = _mm256_unpacklo_epi64(self, other);
+ auto hi = _mm256_unpackhi_epi64(self, other);
+ return _mm256_permute2f128_si256(lo, hi, 0x31);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ // zip_lo
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
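+            // As in zip_hi, the unpack intrinsics work per 128-bit lane; here the low lane
+            // of the unpackhi result is inserted above the low lane of the unpacklo result
+            // to form the zip of the low halves.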
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ auto lo = _mm256_unpacklo_epi8(self, other);
+ auto hi = _mm256_unpackhi_epi8(self, other);
+ return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ auto lo = _mm256_unpacklo_epi16(self, other);
+ auto hi = _mm256_unpackhi_epi16(self, other);
+ return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ auto lo = _mm256_unpacklo_epi32(self, other);
+ auto hi = _mm256_unpackhi_epi32(self, other);
+ return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto lo = _mm256_unpacklo_epi64(self, other);
+ auto hi = _mm256_unpackhi_epi64(self, other);
+ return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ }
+}
+
+#endif