Diffstat (limited to 'third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp')
-rw-r--r-- | third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp | 212 |
1 file changed, 212 insertions, 0 deletions
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp
new file mode 100644
index 0000000000..7840ea8fc5
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp
@@ -0,0 +1,212 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
+ * Martin Renou                                                            *
+ * Copyright (c) QuantStack                                                *
+ * Copyright (c) Serge Guelton                                             *
+ *                                                                         *
+ * Distributed under the terms of the BSD 3-Clause License.                *
+ *                                                                         *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512_DQ_HPP
+#define XSIMD_AVX512_DQ_HPP
+
+#include "../types/xsimd_avx512dq_register.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        // bitwise_and
+        template <class A>
+        inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_and_ps(self, other);
+        }
+        template <class A>
+        inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_and_pd(self, other);
+        }
+
+        // bitwise_andnot
+        template <class A>
+        inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_andnot_ps(other, self);
+        }
+        template <class A>
+        inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_andnot_pd(other, self);
+        }
+
+        // bitwise_not
+        template <class A>
+        inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
+        }
+        template <class A>
+        inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
+        }
+
+        // bitwise_or
+        template <class A>
+        inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_or_ps(self, other);
+        }
+        template <class A>
+        inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_or_pd(self, other);
+        }
+
+        template <class A, class T>
+        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return register_type(self.data | other.data);
+        }
+
+        // bitwise_xor
+        template <class A>
+        inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_xor_ps(self, other);
+        }
+        template <class A>
+        inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_xor_pd(self, other);
+        }
+
+        // haddp
+        template <class A>
+        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512dq>) noexcept
+        {
+            // The following folds over the vector once:
+            // tmp1 = [a0..8, b0..8]
+            // tmp2 = [a8..f, b8..f]
+#define XSIMD_AVX512_HADDP_STEP1(I, a, b)                                    \
+    batch<float, avx512f> res##I;                                            \
+    {                                                                        \
+        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0));     \
+        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2));     \
+        res##I = _mm512_add_ps(tmp1, tmp2);                                  \
+    }
+
+            XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
+            XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
+            XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
+            XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
+            XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
+            XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
+            XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
+            XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);
+
+#undef XSIMD_AVX512_HADDP_STEP1
+
+            // The following folds the code and shuffles so that hadd_ps produces the correct result
+            // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3)
+            // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4)
+            // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
+#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d)                                \
+    batch<float, avx2> halfx##I;                                               \
+    {                                                                          \
+        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0));       \
+        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1));       \
+                                                                               \
+        auto resx1 = _mm512_add_ps(tmp1, tmp2);                                \
+                                                                               \
+        auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0));       \
+        auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1));       \
+                                                                               \
+        auto resx2 = _mm512_add_ps(tmp3, tmp4);                                \
+                                                                               \
+        auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0));  \
+        auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1));  \
+                                                                               \
+        auto resx3 = _mm512_add_ps(tmp5, tmp6);                                \
+                                                                               \
+        halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0),            \
+                                  _mm512_extractf32x8_ps(resx3, 1));           \
+    }
+
+            XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
+            XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);
+
+#undef XSIMD_AVX512_HADDP_STEP2
+
+            auto concat = _mm512_castps256_ps512(halfx0);
+            concat = _mm512_insertf32x8(concat, halfx1, 1);
+            return concat;
+        }
+
+        // ldexp
+        template <class A>
+        inline batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other));
+        }
+
+        // mul
+        template <class A>
+        inline batch<uint64_t, A> mul(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_mullo_epi64(self, other);
+        }
+
+        template <class A>
+        inline batch<int64_t, A> mul(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_mullo_epi64(self, other);
+        }
+
+        // nearbyint_as_int
+        template <class A>
+        inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
+                                                  requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_cvtpd_epi64(self);
+        }
+
+        // reduce_add
+        template <class A>
+        inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
+        {
+            __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
+            __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
+            __m256 res1 = _mm256_add_ps(tmp1, tmp2);
+            return reduce_add(batch<float, avx2>(res1), avx2 {});
+        }
+
+        // convert
+        namespace detail
+        {
+            template <class A>
+            inline batch<double, A> fast_cast(batch<int64_t, A> const& self, batch<double, A> const&, requires_arch<avx512dq>) noexcept
+            {
+                return _mm512_cvtepi64_pd(self);
+            }
+
+            template <class A>
+            inline batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<avx512dq>) noexcept
+            {
+                return _mm512_cvttpd_epi64(self);
+            }
+
+        }
+
+    }
+
+}
+
+#endif
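
For context, here is a minimal usage sketch, not part of the diff, of how these kernels are reached through xsimd's public batch API. It assumes the xsimd headers are on the include path and the translation unit is compiled with AVX-512DQ enabled (for example -mavx512dq); under those assumptions the 64-bit multiply should lower to _mm512_mullo_epi64 via the mul kernel above, and nearbyint_as_int to _mm512_cvtpd_epi64.

#include <cstdint>
#include <xsimd/xsimd.hpp>

int main()
{
    // Explicitly request the AVX-512DQ architecture so dispatch resolves to
    // the kernels in xsimd_avx512dq.hpp rather than a lower-tier fallback.
    using arch = xsimd::avx512dq;

    xsimd::batch<int64_t, arch> a(3), b(7);
    xsimd::batch<int64_t, arch> c = a * b; // mul kernel -> _mm512_mullo_epi64

    xsimd::batch<double, arch> x(2.5);
    auto r = xsimd::nearbyint_as_int(x);   // nearbyint_as_int kernel -> _mm512_cvtpd_epi64

    // Consume the results so the compiler keeps the computation.
    return static_cast<int>(c.get(0) + r.get(0));
}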
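The two macro passes in the haddp kernel are easier to verify against a scalar reference. The sketch below states the intended result, assuming xsimd's usual haddp contract in which element i of the output is the horizontal sum of the sixteen floats in row[i]; the function name is illustrative only.

#include <cstddef>

// Scalar reference for haddp on 16 rows of 16 floats: out[i] is the sum of row i.
void haddp_reference(const float rows[16][16], float out[16])
{
    for (std::size_t i = 0; i < 16; ++i)
    {
        float sum = 0.0f;
        for (std::size_t j = 0; j < 16; ++j)
            sum += rows[i][j];
        out[i] = sum;
    }
}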