Diffstat (limited to 'third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp')
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp  212
1 file changed, 212 insertions(+), 0 deletions(-)
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp
new file mode 100644
index 0000000000..7840ea8fc5
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp
@@ -0,0 +1,212 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512_DQ_HPP
+#define XSIMD_AVX512_DQ_HPP
+
+#include "../types/xsimd_avx512dq_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
+ // bitwise_and
+ template <class A>
+ inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_and_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_and_pd(self, other);
+ }
+
+ // bitwise_andnot
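+        // Note: xsimd's bitwise_andnot(self, other) computes self & ~other, while the
+        // intrinsic computes ~arg0 & arg1, hence the swapped operands below.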
+ template <class A>
+ inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_andnot_ps(other, self);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_andnot_pd(other, self);
+ }
+
+ // bitwise_not
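+        // Flip every bit by XOR-ing with an all-ones mask built from _mm512_set1_epi32(-1).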
+ template <class A>
+        inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
+ }
+ template <class A>
+        inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
+ }
+
+ // bitwise_or
+ template <class A>
+ inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_or_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_or_pd(self, other);
+ }
+
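+        // batch_bool on AVX512 is backed by an opmask register, so the boolean OR is
+        // performed directly on the mask bits.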
+ template <class A, class T>
+ inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return register_type(self.data | other.data);
+ }
+
+ // bitwise_xor
+ template <class A>
+ inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_xor_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_xor_pd(self, other);
+ }
+
+ // haddp
+ template <class A>
+ inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512dq>) noexcept
+ {
+ // The following folds over the vector once:
+            // tmp1 = [a0..7, b0..7]
+ // tmp2 = [a8..f, b8..f]
+#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
+ batch<float, avx512f> res##I; \
+ { \
+ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
+ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
+ res##I = _mm512_add_ps(tmp1, tmp2); \
+ }
+
+ XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
+ XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
+ XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
+ XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
+ XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
+ XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
+ XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
+ XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);
+
+#undef XSIMD_AVX512_HADDP_STEP1
+
+            // The following folds the partial results and shuffles them so that hadd_ps
+            // produces the correct result; the final lane layout is noted after this function.
+            // tmp1 = [a0..3, a8..11, b0..3, b8..11] (same for tmp3)
+            // tmp2 = [a4..7, a12..15, b4..7, b12..15] (same for tmp4)
+ // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
+#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \
+ batch<float, avx2> halfx##I; \
+ { \
+ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \
+ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \
+ \
+ auto resx1 = _mm512_add_ps(tmp1, tmp2); \
+ \
+ auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
+ auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
+ \
+ auto resx2 = _mm512_add_ps(tmp3, tmp4); \
+ \
+ auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
+ auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
+ \
+ auto resx3 = _mm512_add_ps(tmp5, tmp6); \
+ \
+ halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \
+ _mm512_extractf32x8_ps(resx3, 1)); \
+ }
+
+ XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
+ XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);
+
+#undef XSIMD_AVX512_HADDP_STEP2
+
+ auto concat = _mm512_castps256_ps512(halfx0);
+ concat = _mm512_insertf32x8(concat, halfx1, 1);
+ return concat;
+ }
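+        // Resulting layout: lane i of the returned batch holds the horizontal sum of
+        // row[i] across the 16 input batches. A usage sketch through the public API:
+        //   batch<float, avx512dq> rows[16] = { /* ... */ };
+        //   batch<float, avx512dq> sums = xsimd::haddp(rows);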
+
+ // ldexp
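+        // Computes self * 2^other: _mm512_scalef_pd scales by a power of two given as a
+        // double, and AVX512DQ provides the direct int64 -> double conversion for the exponent.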
+ template <class A>
+ inline batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other));
+ }
+
+ // mul
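+        // AVX512DQ adds a genuine 64-bit low multiply (_mm512_mullo_epi64), so the
+        // 64-bit product does not have to be emulated here.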
+ template <class A>
+ inline batch<uint64_t, A> mul(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_mullo_epi64(self, other);
+ }
+
+ template <class A>
+ inline batch<int64_t, A> mul(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_mullo_epi64(self, other);
+ }
+
+ // nearbyint_as_int
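+        // _mm512_cvtpd_epi64 converts using the current rounding mode (round-to-nearest-even
+        // by default), which matches nearbyint semantics.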
+ template <class A>
+ inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
+ requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_cvtpd_epi64(self);
+ }
+
+ // reduce_add
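+        // Add the two 256-bit halves of the register, then defer to the AVX2 kernel for
+        // the remaining horizontal reduction.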
+ template <class A>
+        inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
+ {
+ __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
+ __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
+ __m256 res1 = _mm256_add_ps(tmp1, tmp2);
+ return reduce_add(batch<float, avx2>(res1), avx2 {});
+ }
+
+ // convert
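+        // fast_cast maps to the AVX512DQ direct conversions: cvtepi64_pd for
+        // int64 -> double and the truncating cvttpd_epi64 for double -> int64.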
+ namespace detail
+ {
+ template <class A>
+ inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx512dq>) noexcept
+ {
+                return _mm512_cvtepi64_pd(x);
+ }
+
+ template <class A>
+ inline batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_cvttpd_epi64(self);
+ }
+
+ }
+
+ }
+
+}
+
+#endif