Diffstat (limited to 'third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp')
-rw-r--r-- | third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp | 983
1 file changed, 983 insertions, 0 deletions
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp
new file mode 100644
index 0000000000..a5b07ec9da
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp
@@ -0,0 +1,983 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX2_HPP
+#define XSIMD_AVX2_HPP
+
+#include <complex>
+#include <type_traits>
+
+#include "../types/xsimd_avx2_register.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        // abs
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_abs_epi8(self);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_abs_epi16(self);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_abs_epi32(self);
+                }
+                else
+                {
+                    return abs(self, avx {});
+                }
+            }
+            return self;
+        }
+
+        // add
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm256_add_epi8(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_add_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_add_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_add_epi64(self, other);
+            }
+            else
+            {
+                return add(self, other, avx {});
+            }
+        }
+
+        // bitwise_and
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_and_si256(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_and_si256(self, other);
+        }
+
+        // bitwise_andnot
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_andnot_si256(other, self);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_andnot_si256(other, self);
+        }
+
+        // bitwise_not
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
+        }
+
+        // bitwise_lshift
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_slli_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_slli_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_slli_epi64(self, other);
+            }
+            else
+            {
+                return bitwise_lshift(self, other, avx {});
+            }
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_sllv_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_sllv_epi64(self, other);
+            }
+            else
+            {
+                return bitwise_lshift(self, other, avx {});
+            }
+        }
+
+        // bitwise_or
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_or_si256(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_or_si256(self, other);
+        }
+
+        // bitwise_rshift
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF);
+                    __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self);
+                    __m256i res = _mm256_srai_epi16(self, other);
+                    return _mm256_or_si256(
+                        detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                           { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
+                                           sign_mask, cmp_is_negative),
+                        _mm256_andnot_si256(sign_mask, res));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_srai_epi16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_srai_epi32(self, other);
+                }
+                else
+                {
+                    return bitwise_rshift(self, other, avx {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_srli_epi16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_srli_epi32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm256_srli_epi64(self, other);
+                }
+                else
+                {
+                    return bitwise_rshift(self, other, avx {});
+                }
+            }
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_srav_epi32(self, other);
+                }
+                else
+                {
+                    return bitwise_rshift(self, other, avx {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_srlv_epi32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm256_srlv_epi64(self, other);
+                }
+                else
+                {
+                    return bitwise_rshift(self, other, avx {});
+                }
+            }
+        }
+
+        // bitwise_xor
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_xor_si256(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_xor_si256(self, other);
+        }
+
+        // complex_low
+        template <class A>
+        inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
+        {
+            __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0));
+            __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0));
+            return _mm256_blend_pd(tmp0, tmp1, 10);
+        }
+
+        // complex_high
+        template <class A>
+        inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
+        {
+            __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2));
+            __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0));
+            return _mm256_blend_pd(tmp0, tmp1, 10);
+        }
+
+        // fast_cast
+        namespace detail
+        {
+
+            template <class A>
+            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                // adapted to avx
+                __m256i xH = _mm256_srli_epi64(x, 32);
+                xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
+                __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
+                                                 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
+                __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
+                __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
+                return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
+            }
+
+            template <class A>
+            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                // adapted to avx
+                __m256i xH = _mm256_srai_epi32(x, 16);
+                xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
+                xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67
+                __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
+                                                 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
+                __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
+                __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
+                return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
+            }
+        }
+
+        // eq
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm256_cmpeq_epi8(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_cmpeq_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_cmpeq_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_cmpeq_epi64(self, other);
+            }
+            else
+            {
+                return eq(self, other, avx {});
+            }
+        }
+
+        // gather
+        template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
+        inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+                                  kernel::requires_arch<avx2>) noexcept
+        {
+            // scatter for this one is AVX512F+AVX512VL
+            return _mm256_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
+        }
+
+        template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
+        inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+                                  kernel::requires_arch<avx2>) noexcept
+        {
+            // scatter for this one is AVX512F+AVX512VL
+            return _mm256_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
+        }
+
+        template <class A, class U,
+                  detail::enable_sized_integral_t<U, 4> = 0>
+        inline batch<float, A> gather(batch<float, A> const&, float const* src,
+                                      batch<U, A> const& index,
+                                      kernel::requires_arch<avx2>) noexcept
+        {
+            // scatter for this one is AVX512F+AVX512VL
+            return _mm256_i32gather_ps(src, index, sizeof(float));
+        }
+
+        template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
+        inline batch<double, A> gather(batch<double, A> const&, double const* src,
+                                       batch<U, A> const& index,
+                                       requires_arch<avx2>) noexcept
+        {
+            // scatter for this one is AVX512F+AVX512VL
+            return _mm256_i64gather_pd(src, index, sizeof(double));
+        }
+
+        // gather: handmade conversions
+        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
+        inline batch<float, A> gather(batch<float, A> const&, double const* src,
+                                      batch<V, A> const& index,
+                                      requires_arch<avx2>) noexcept
+        {
+            const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
+            const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
+            return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data));
+        }
+
+        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
+        inline batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
+                                        batch<V, A> const& index,
+                                        requires_arch<avx2>) noexcept
+        {
+            const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
+            const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
+            return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data));
+        }
+
+        // lt
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_cmpgt_epi8(other, self);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_cmpgt_epi16(other, self);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_cmpgt_epi32(other, self);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm256_cmpgt_epi64(other, self);
+                }
+                else
+                {
+                    return lt(self, other, avx {});
+                }
+            }
+            else
+            {
+                return lt(self, other, avx {});
+            }
+        }
+
+        // load_complex
+        template <class A>
+        inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
+        {
+            using batch_type = batch<float, A>;
+            batch_type real = _mm256_castpd_ps(
+                _mm256_permute4x64_pd(
+                    _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))),
+                    _MM_SHUFFLE(3, 1, 2, 0)));
+            batch_type imag = _mm256_castpd_ps(
+                _mm256_permute4x64_pd(
+                    _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))),
+                    _MM_SHUFFLE(3, 1, 2, 0)));
+            return { real, imag };
+        }
+        template <class A>
+        inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx2>) noexcept
+        {
+            using batch_type = batch<double, A>;
+            batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
+            batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
+            return { real, imag };
+        }
+        // mask
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
+                return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12);
+            }
+            else
+            {
+                return mask(self, avx {});
+            }
+        }
+
+        // max
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_max_epi8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_max_epi16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_max_epi32(self, other);
+                }
+                else
+                {
+                    return max(self, other, avx {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_max_epu8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_max_epu16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_max_epu32(self, other);
+                }
+                else
+                {
+                    return max(self, other, avx {});
+                }
+            }
+        }
+
+        // min
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_min_epi8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_min_epi16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_min_epi32(self, other);
+                }
+                else
+                {
+                    return min(self, other, avx {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_min_epu8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_min_epu16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_min_epu32(self, other);
+                }
+                else
+                {
+                    return min(self, other, avx {});
+                }
+            }
+        }
+
+        // mul
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                __m256i mask_hi = _mm256_set1_epi32(0xFF00FF00);
+                __m256i res_lo = _mm256_mullo_epi16(self, other);
+                __m256i other_hi = _mm256_srli_epi16(other, 8);
+                __m256i self_hi = _mm256_and_si256(self, mask_hi);
+                __m256i res_hi = _mm256_mullo_epi16(self_hi, other_hi);
+                __m256i res = _mm256_blendv_epi8(res_lo, res_hi, mask_hi);
+                return res;
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_mullo_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_mullo_epi32(self, other);
+            }
+            else
+            {
+                return mul(self, other, avx {});
+            }
+        }
+
+        // reduce_add
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                __m256i tmp1 = _mm256_hadd_epi32(self, self);
+                __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
+                __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
+                __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
+                return _mm_cvtsi128_si32(tmp4);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
+                __m256i tmp2 = _mm256_add_epi64(self, tmp1);
+                __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
+                __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
+#if defined(__x86_64__)
+                return _mm_cvtsi128_si64(res);
+#else
+                __m128i m;
+                _mm_storel_epi64(&m, res);
+                int64_t i;
+                std::memcpy(&i, &m, sizeof(i));
+                return i;
+#endif
+            }
+            else
+            {
+                return reduce_add(self, avx {});
+            }
+        }
+
+        // rotate_right
+        template <size_t N, class A>
+        inline batch<uint16_t, A> rotate_right(batch<uint16_t, A> const& self, requires_arch<avx2>) noexcept
+        {
+            return _mm256_alignr_epi8(self, self, N);
+        }
+        template <size_t N, class A>
+        inline batch<int16_t, A> rotate_right(batch<int16_t, A> const& self, requires_arch<avx2>) noexcept
+        {
+            return bitwise_cast<int16_t>(rotate_right<N, A>(bitwise_cast<uint16_t>(self), avx2 {}));
+        }
+
+        // sadd
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_adds_epi8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_adds_epi16(self, other);
+                }
+                else
+                {
+                    return sadd(self, other, avx {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_adds_epu8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_adds_epu16(self, other);
+                }
+                else
+                {
+                    return sadd(self, other, avx {});
+                }
+            }
+        }
+
+        // select
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm256_blendv_epi8(false_br, true_br, cond);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_blendv_epi8(false_br, true_br, cond);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_blendv_epi8(false_br, true_br, cond);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_blendv_epi8(false_br, true_br, cond);
+            }
+            else
+            {
+                return select(cond, true_br, false_br, avx {});
+            }
+        }
+        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+        {
+            constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
+            // FIXME: for some reason mask here is not considered as an immediate,
+            // but it's okay for _mm256_blend_epi32
+            // case 2: return _mm256_blend_epi16(false_br, true_br, mask);
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_blend_epi32(false_br, true_br, mask);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                constexpr int imask = detail::interleave(mask);
+                return _mm256_blend_epi32(false_br, true_br, imask);
+            }
+            else
+            {
+                return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
+            }
+        }
+
+        // slide_left
+        template <size_t N, class A, class T>
+        inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
+        {
+            constexpr unsigned BitCount = N * 8;
+            if (BitCount == 0)
+            {
+                return x;
+            }
+            if (BitCount >= 256)
+            {
+                return batch<T, A>(T(0));
+            }
+            if (BitCount > 128)
+            {
+                constexpr unsigned M = (BitCount - 128) / 8;
+                auto y = _mm256_bslli_epi128(x, M);
+                return _mm256_permute2x128_si256(y, y, 0x28);
+            }
+            if (BitCount == 128)
+            {
+                return _mm256_permute2x128_si256(x, x, 0x28);
+            }
+            // shifting by [0, 128[ bits
+            constexpr unsigned M = BitCount / 8;
+            auto y = _mm256_bslli_epi128(x, M);
+            auto z = _mm256_bsrli_epi128(x, 16 - M);
+            auto w = _mm256_permute2x128_si256(z, z, 0x28);
+            return _mm256_or_si256(y, w);
+        }
+
+        // slide_right
+        template <size_t N, class A, class T>
+        inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
+        {
+            constexpr unsigned BitCount = N * 8;
+            if (BitCount == 0)
+            {
+                return x;
+            }
+            if (BitCount >= 256)
+            {
+                return batch<T, A>(T(0));
+            }
+            if (BitCount > 128)
+            {
+                constexpr unsigned M = (BitCount - 128) / 8;
+                auto y = _mm256_bsrli_epi128(x, M);
+                return _mm256_permute2x128_si256(y, y, 0x81);
+            }
+            if (BitCount == 128)
+            {
+                return _mm256_permute2x128_si256(x, x, 0x81);
+            }
+            // shifting by [0, 128[ bits
+            constexpr unsigned M = BitCount / 8;
+            auto y = _mm256_bsrli_epi128(x, M);
+            auto z = _mm256_bslli_epi128(x, 16 - M);
+            auto w = _mm256_permute2x128_si256(z, z, 0x81);
+            return _mm256_or_si256(y, w);
+        }
+
+        // ssub
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_subs_epi8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_subs_epi16(self, other);
+                }
+                else
+                {
+                    return ssub(self, other, avx {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_subs_epu8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_subs_epu16(self, other);
+                }
+                else
+                {
+                    return ssub(self, other, avx {});
+                }
+            }
+        }
+
+        // sub
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm256_sub_epi8(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_sub_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_sub_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_sub_epi64(self, other);
+            }
+            else
+            {
+                return sub(self, other, avx {});
+            }
+        }
+
+        // swizzle (dynamic mask)
+        template <class A>
+        inline batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
+        {
+            return _mm256_permutevar8x32_ps(self, mask);
+        }
+
+        template <class A>
+        inline batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
+        {
+            batch<uint32_t, A> broadcaster = { 0, 1, 0, 1, 0, 1, 0, 1 };
+            constexpr uint64_t comb = 0x0000000100000001ul * 2;
+            return bitwise_cast<double>(swizzle(bitwise_cast<float>(self), bitwise_cast<uint32_t>(mask * comb) + broadcaster, avx2 {}));
+        }
+
+        template <class A>
+        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
+        {
+            return bitwise_cast<uint64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
+        }
+        template <class A>
+        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
+        {
+            return bitwise_cast<int64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
+        }
+        template <class A>
+        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
+        {
+            return _mm256_permutevar8x32_epi32(self, mask);
+        }
+        template <class A>
+        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
+        {
+            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
+        }
+
+        // swizzle (constant mask)
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+        {
+            return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask);
+        }
+
+        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+        {
+            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
+            return _mm256_permute4x64_pd(self, mask);
+        }
+
+        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+        {
+            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
+            return _mm256_permute4x64_epi64(self, mask);
+        }
+        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
+        {
+            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
+        }
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+        {
+            return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask);
+        }
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+        {
+            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
+        }
+
+        // zip_hi
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                auto lo = _mm256_unpacklo_epi8(self, other);
+                auto hi = _mm256_unpackhi_epi8(self, other);
+                return _mm256_permute2f128_si256(lo, hi, 0x31);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                auto lo = _mm256_unpacklo_epi16(self, other);
+                auto hi = _mm256_unpackhi_epi16(self, other);
+                return _mm256_permute2f128_si256(lo, hi, 0x31);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                auto lo = _mm256_unpacklo_epi32(self, other);
+                auto hi = _mm256_unpackhi_epi32(self, other);
+                return _mm256_permute2f128_si256(lo, hi, 0x31);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                auto lo = _mm256_unpacklo_epi64(self, other);
+                auto hi = _mm256_unpackhi_epi64(self, other);
+                return _mm256_permute2f128_si256(lo, hi, 0x31);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
+        // zip_lo
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                auto lo = _mm256_unpacklo_epi8(self, other);
+                auto hi = _mm256_unpackhi_epi8(self, other);
+                return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                auto lo = _mm256_unpacklo_epi16(self, other);
+                auto hi = _mm256_unpackhi_epi16(self, other);
+                return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                auto lo = _mm256_unpacklo_epi32(self, other);
+                auto hi = _mm256_unpackhi_epi32(self, other);
+                return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                auto lo = _mm256_unpacklo_epi64(self, other);
+                auto hi = _mm256_unpackhi_epi64(self, other);
+                return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+    }
+}
+
+#endif
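
For context, and not part of the diff itself: a minimal usage sketch of how these kernel overloads are reached through xsimd's public batch API, assuming xsimd is on the include path and the translation unit is built with AVX2 enabled (e.g. -mavx2). Requesting the xsimd::avx2 architecture tag makes the arithmetic operators on xsimd::batch resolve to the kernels above that take requires_arch<avx2>; the addition below goes through the add kernel in this file, i.e. _mm256_add_epi32 for 32-bit integers.

// usage_sketch.cpp -- illustrative only; build with: g++ -O2 -mavx2 usage_sketch.cpp
#include <cstdint>
#include <iostream>
#include "xsimd/xsimd.hpp"

int main()
{
    // Explicitly request the AVX2 architecture so operator+ dispatches to the
    // add(batch, batch, requires_arch<avx2>) overload defined in this header.
    using batch_type = xsimd::batch<int32_t, xsimd::avx2>;

    alignas(32) int32_t a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    alignas(32) int32_t b[8] = { 8, 7, 6, 5, 4, 3, 2, 1 };
    alignas(32) int32_t out[8];

    batch_type va = batch_type::load_aligned(a);
    batch_type vb = batch_type::load_aligned(b);
    (va + vb).store_aligned(out); // _mm256_add_epi32 under the hood

    for (int32_t v : out)
        std::cout << v << ' ';
    std::cout << '\n';
}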