From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- third_party/xsimd/Changelog.rst | 151 ++ third_party/xsimd/LICENSE | 29 + .../arch/generic/xsimd_generic_arithmetic.hpp | 152 ++ .../xsimd/arch/generic/xsimd_generic_complex.hpp | 96 + .../xsimd/arch/generic/xsimd_generic_details.hpp | 239 ++ .../xsimd/arch/generic/xsimd_generic_logical.hpp | 163 ++ .../xsimd/arch/generic/xsimd_generic_math.hpp | 2418 ++++++++++++++++++ .../xsimd/arch/generic/xsimd_generic_memory.hpp | 397 +++ .../xsimd/arch/generic/xsimd_generic_rounding.hpp | 72 + .../xsimd/arch/generic/xsimd_generic_trigo.hpp | 969 +++++++ third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp | 1657 ++++++++++++ .../xsimd/include/xsimd/arch/xsimd_avx2.hpp | 950 +++++++ .../xsimd/include/xsimd/arch/xsimd_avx512bw.hpp | 627 +++++ .../xsimd/include/xsimd/arch/xsimd_avx512cd.hpp | 28 + .../xsimd/include/xsimd/arch/xsimd_avx512dq.hpp | 212 ++ .../xsimd/include/xsimd/arch/xsimd_avx512f.hpp | 2028 +++++++++++++++ .../xsimd/include/xsimd/arch/xsimd_constants.hpp | 384 +++ .../xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp | 80 + .../xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp | 46 + .../xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp | 79 + .../xsimd/include/xsimd/arch/xsimd_fma4.hpp | 79 + .../xsimd/include/xsimd/arch/xsimd_generic.hpp | 23 + .../xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp | 38 + third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp | 86 + .../xsimd/include/xsimd/arch/xsimd_neon.hpp | 2670 ++++++++++++++++++++ .../xsimd/include/xsimd/arch/xsimd_neon64.hpp | 1322 ++++++++++ .../xsimd/include/xsimd/arch/xsimd_scalar.hpp | 1043 ++++++++ .../xsimd/include/xsimd/arch/xsimd_sse2.hpp | 1695 +++++++++++++ .../xsimd/include/xsimd/arch/xsimd_sse3.hpp | 64 + .../xsimd/include/xsimd/arch/xsimd_sse4_1.hpp | 350 +++ .../xsimd/include/xsimd/arch/xsimd_sse4_2.hpp | 44 + .../xsimd/include/xsimd/arch/xsimd_ssse3.hpp | 142 ++ third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp | 1126 +++++++++ .../xsimd/include/xsimd/config/xsimd_arch.hpp | 249 ++ .../xsimd/include/xsimd/config/xsimd_config.hpp | 350 +++ .../xsimd/include/xsimd/config/xsimd_cpuid.hpp | 181 ++ .../xsimd/include/xsimd/math/xsimd_rem_pio2.hpp | 719 ++++++ .../xsimd/memory/xsimd_aligned_allocator.hpp | 349 +++ .../xsimd/include/xsimd/memory/xsimd_alignment.hpp | 91 + .../include/xsimd/types/xsimd_all_registers.hpp | 32 + .../xsimd/include/xsimd/types/xsimd_api.hpp | 2310 +++++++++++++++++ .../include/xsimd/types/xsimd_avx2_register.hpp | 40 + .../xsimd/types/xsimd_avx512bw_register.hpp | 48 + .../xsimd/types/xsimd_avx512cd_register.hpp | 48 + .../xsimd/types/xsimd_avx512dq_register.hpp | 48 + .../include/xsimd/types/xsimd_avx512f_register.hpp | 75 + .../include/xsimd/types/xsimd_avx_register.hpp | 62 + .../xsimd/include/xsimd/types/xsimd_batch.hpp | 1491 +++++++++++ .../include/xsimd/types/xsimd_batch_constant.hpp | 147 ++ .../xsimd/types/xsimd_fma3_avx2_register.hpp | 46 + .../xsimd/types/xsimd_fma3_avx_register.hpp | 46 + .../xsimd/types/xsimd_fma3_sse_register.hpp | 46 + .../include/xsimd/types/xsimd_fma4_register.hpp | 42 + .../include/xsimd/types/xsimd_generic_arch.hpp | 35 + .../include/xsimd/types/xsimd_neon64_register.hpp | 52 + .../include/xsimd/types/xsimd_neon_register.hpp | 155 ++ .../xsimd/include/xsimd/types/xsimd_register.hpp | 94 + .../include/xsimd/types/xsimd_sse2_register.hpp | 61 + .../include/xsimd/types/xsimd_sse3_register.hpp | 45 + .../include/xsimd/types/xsimd_sse4_1_register.hpp | 44 + .../include/xsimd/types/xsimd_sse4_2_register.hpp | 44 + .../include/xsimd/types/xsimd_ssse3_register.hpp | 44 + .../include/xsimd/types/xsimd_sve_register.hpp | 155 ++ .../xsimd/include/xsimd/types/xsimd_traits.hpp | 319 +++ .../xsimd/include/xsimd/types/xsimd_utils.hpp | 530 ++++ third_party/xsimd/include/xsimd/xsimd.hpp | 68 + third_party/xsimd/moz.yaml | 37 + 67 files changed, 27562 insertions(+) create mode 100644 third_party/xsimd/Changelog.rst create mode 100644 third_party/xsimd/LICENSE create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_complex.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_logical.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_rounding.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_fma4.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_generic.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp create mode 100644 third_party/xsimd/include/xsimd/config/xsimd_arch.hpp create mode 100644 third_party/xsimd/include/xsimd/config/xsimd_config.hpp create mode 100644 third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp create mode 100644 third_party/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp create mode 100644 third_party/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp create mode 100644 third_party/xsimd/include/xsimd/memory/xsimd_alignment.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_api.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_avx2_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_batch.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_fma4_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_neon64_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_neon_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_sse3_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_traits.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_utils.hpp create mode 100644 third_party/xsimd/include/xsimd/xsimd.hpp create mode 100644 third_party/xsimd/moz.yaml (limited to 'third_party/xsimd') diff --git a/third_party/xsimd/Changelog.rst b/third_party/xsimd/Changelog.rst new file mode 100644 index 0000000000..fa4acbce37 --- /dev/null +++ b/third_party/xsimd/Changelog.rst @@ -0,0 +1,151 @@ +.. Copyright (c) Serge Guelton and Johan Mabille + Copyright (c) QuantStack + + Distributed under the terms of the BSD 3-Clause License. + + The full license is in the file LICENSE, distributed with this software. + + +Changelog +========= + +9.0.1 +----- + + * Fix potential ABI issue in SVE support, making ``xsimd::sve`` a type alias to + size-dependent type. + +9.0.0 +----- + + * Support fixed size SVE + + * Fix a bug in SSSE3 ``xsimd::swizzle`` implementation for ``int8`` and ``int16`` + + * Rename ``xsimd::hadd`` into ``xsimd::reduce_add``, provide ``xsimd::reduce_min`` and ``xsimd::reduce_max`` + + * Properly report unsupported double for neon on arm32 + + * Fill holes in xsimd scalar api + + * Fix ``find_package(xsimd)`` for xtl enabled xsimd + + * Replace ``xsimd::bool_cast`` by ``xsimd::batch_bool_cast`` + + * Native ``xsimd::hadd`` for float on arm64 + + * Properly static_assert when trying to instantiate an ``xsimd::batch`` of xtl complex + + * Introduce ``xsimd::batch_bool::mask()`` and ``batch_bool::from_mask(...)`` + + * Flag some function with ``[[nodiscard]]`` + + * Accept both relative and absolute libdir and include dir in xsimd.pc + + * Implement ``xsimd::nearbyint_as_int`` for NEON + + * Add ``xsimd::polar`` + + * Speedup double -> F32/I32 gathers + + * Add ``xsimd::slide_left`` and ``xsimd::slide_right`` + + * Support integral ``xsimd::swizzles`` on AVX + +8.1.0 +----- + + * Add ``xsimd::gather`` and ``xsimd::scatter`` + + * Add ``xsimd::nearbyint_as_int`` + + * Add ``xsimd::none`` + + * Add ``xsimd::reciprocal`` + + * Remove batch constructor from memory adress, use ``xsimd::batch<...>::load_(un)aligned`` instead + + * Leave to msvc users the opportunity to manually disable FMA3 on AVX + + * Provide ``xsimd::insert`` to modify a single value from a vector + + * Make ``xsimd::pow`` implementation resilient to ``FE_INVALID`` + + * Reciprocal square root support through ``xsimd::rsqrt`` + + * NEON: Improve ``xsimd::any`` and ``xsimd::all`` + + * Provide type utility to explicitly require a batch of given size and type + + * Implement ``xsimd::swizzle`` on x86, neon and neon64 + + * Avx support for ``xsimd::zip_lo`` and ``xsimd::zip_hi`` + + * Only use ``_mm256_unpacklo_epi`` on AVX2 + + * Provide neon/neon64 conversion function from ``uint(32|64)_t`` to ``(float|double)`` + + * Provide SSE/AVX/AVX2 conversion function from ``uint32_t`` to ``float`` + + * Provide AVX2 conversion function from ``(u)int64_t`` to ``double`` + + * Provide better SSE conversion function from ``uint64_t`` to ``double`` + + * Provide better SSE conversion function to ``double`` + + * Support logical xor for ``xsimd::batch_bool`` + + * Clarify fma support: + + - FMA3 + SSE -> ``xsimd::fma3`` + - FMA3 + AVX -> ``xsimd::fma3`` + - FMA3 + AVX2 -> ``xsimd::fma3`` + - FMA4 -> ``xsimd::fma4`` + + * Allow ``xsimd::transform`` to work with complex types + + * Add missing scalar version of ``xsimd::norm`` and ``xsimd::conj`` + +8.0.5 +----- + + * Fix neon ``xsimd::hadd`` implementation + + * Detect unsupported architectures and set ``XSIMD_NO_SUPPORTED_ARCHITECTURE`` + if needs be + +8.0.4 +----- + + * Provide some conversion operators for ``float`` -> ``uint32`` + + * Improve code generated for AVX2 signed integer comparisons + + * Enable detection of avx512cd and avx512dq, and fix avx512bw detection + + * Enable detection of AVX2+FMA + + * Pick the best compatible architecture in ``xsimd::dispatch`` + + * Enables support for FMA when AVX2 is detected on Windows + + * Add missing includes / forward declaration + + * Mark all functions inline and noexcept + + * Assert when using incomplete ``std::initializer_list`` + +8.0.3 +----- + + * Improve CI & testing, no functional change + +8.0.2 +----- + + * Do not use ``_mm256_srai_epi32`` under AVX, it's an AVX2 instruction + +8.0.1 +----- + + * Fix invalid constexpr ``std::make_tuple`` usage in neon64 diff --git a/third_party/xsimd/LICENSE b/third_party/xsimd/LICENSE new file mode 100644 index 0000000000..eee7a54bc9 --- /dev/null +++ b/third_party/xsimd/LICENSE @@ -0,0 +1,29 @@ +Copyright (c) 2016, Johan Mabille, Sylvain Corlay, Wolf Vollprecht and Martin Renou +Copyright (c) 2016, QuantStack +Copyright (c) 2018, Serge Guelton +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp new file mode 100644 index 0000000000..5b3fef6623 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp @@ -0,0 +1,152 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_ARITHMETIC_HPP +#define XSIMD_GENERIC_ARITHMETIC_HPP + +#include +#include + +#include "./xsimd_generic_details.hpp" + +namespace xsimd +{ + + namespace kernel + { + + using namespace types; + + // bitwise_lshift + template ::value, void>::type*/> + inline batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept + { return x << y; }, + self, other); + } + + // bitwise_rshift + template ::value, void>::type*/> + inline batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept + { return x >> y; }, + self, other); + } + + // div + template ::value, void>::type> + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept -> T + { return x / y; }, + self, other); + } + + // fma + template + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return x * y + z; + } + + template + inline batch, A> fma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept + { + auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); + auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); + return { res_r, res_i }; + } + + // fms + template + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return x * y - z; + } + + template + inline batch, A> fms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept + { + auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); + auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); + return { res_r, res_i }; + } + + // fnma + template + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return -x * y + z; + } + + template + inline batch, A> fnma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept + { + auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); + auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); + return { res_r, res_i }; + } + + // fnms + template + inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return -x * y - z; + } + + template + inline batch, A> fnms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept + { + auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); + auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); + return { res_r, res_i }; + } + + // mul + template ::value, void>::type*/> + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept -> T + { return x * y; }, + self, other); + } + + // sadd + template + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + return add(self, other); // no saturated arithmetic on floating point numbers + } + template + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + return add(self, other); // no saturated arithmetic on floating point numbers + } + + // ssub + template + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + return sub(self, other); // no saturated arithmetic on floating point numbers + } + template + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + return sub(self, other); // no saturated arithmetic on floating point numbers + } + + } + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_complex.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_complex.hpp new file mode 100644 index 0000000000..ede95ee937 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_complex.hpp @@ -0,0 +1,96 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_COMPLEX_HPP +#define XSIMD_GENERIC_COMPLEX_HPP + +#include + +#include "./xsimd_generic_details.hpp" + +namespace xsimd +{ + + namespace kernel + { + + using namespace types; + + // real + template + inline batch real(batch const& self, requires_arch) noexcept + { + return self; + } + + template + inline batch real(batch, A> const& self, requires_arch) noexcept + { + return self.real(); + } + + // imag + template + inline batch imag(batch const& /*self*/, requires_arch) noexcept + { + return batch(T(0)); + } + + template + inline batch imag(batch, A> const& self, requires_arch) noexcept + { + return self.imag(); + } + + // arg + template + inline real_batch_type_t> arg(batch const& self, requires_arch) noexcept + { + return atan2(imag(self), real(self)); + } + + // conj + template + inline complex_batch_type_t> conj(batch const& self, requires_arch) noexcept + { + return { real(self), -imag(self) }; + } + + // norm + template + inline real_batch_type_t> norm(batch const& self, requires_arch) noexcept + { + return { fma(real(self), real(self), imag(self) * imag(self)) }; + } + + // proj + template + inline complex_batch_type_t> proj(batch const& self, requires_arch) noexcept + { + using batch_type = complex_batch_type_t>; + using real_batch = typename batch_type::real_batch; + using real_value_type = typename real_batch::value_type; + auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self)); + return select(cond, + batch_type(constants::infinity(), + copysign(real_batch(real_value_type(0)), imag(self))), + batch_type(self)); + } + + template + inline batch_bool isnan(batch, A> const& self, requires_arch) noexcept + { + return batch_bool(isnan(self.real()) || isnan(self.imag())); + } + } +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp new file mode 100644 index 0000000000..fd66e5d03c --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp @@ -0,0 +1,239 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_DETAILS_HPP +#define XSIMD_GENERIC_DETAILS_HPP + +#include + +#include "../../math/xsimd_rem_pio2.hpp" +#include "../../types/xsimd_generic_arch.hpp" +#include "../../types/xsimd_utils.hpp" +#include "../xsimd_constants.hpp" + +namespace xsimd +{ + // Forward declaration. Should we put them in a separate file? + template + inline batch abs(batch const& self) noexcept; + template + inline batch abs(batch, A> const& self) noexcept; + template + inline bool any(batch_bool const& self) noexcept; + template + inline batch atan2(batch const& self, batch const& other) noexcept; + template + inline batch batch_cast(batch const&, batch const& out) noexcept; + template + inline batch bitofsign(batch const& self) noexcept; + template + inline batch bitwise_cast(batch const& self) noexcept; + template + inline batch cos(batch const& self) noexcept; + template + inline batch cosh(batch const& self) noexcept; + template + inline batch exp(batch const& self) noexcept; + template + inline batch fma(batch const& x, batch const& y, batch const& z) noexcept; + template + inline batch fms(batch const& x, batch const& y, batch const& z) noexcept; + template + inline batch frexp(const batch& x, const batch, A>& e) noexcept; + template + inline batch horner(const batch& self) noexcept; + template + inline batch hypot(const batch& self) noexcept; + template + inline batch_bool is_even(batch const& self) noexcept; + template + inline batch_bool is_flint(batch const& self) noexcept; + template + inline batch_bool is_odd(batch const& self) noexcept; + template + inline batch_bool isinf(batch const& self) noexcept; + template + inline typename batch::batch_bool_type isnan(batch const& self) noexcept; + template + inline batch ldexp(const batch& x, const batch, A>& e) noexcept; + template + inline batch log(batch const& self) noexcept; + template + inline batch nearbyint(batch const& self) noexcept; + template + inline batch, A> nearbyint_as_int(const batch& x) noexcept; + template + inline T reduce_add(batch const&) noexcept; + template + inline batch select(batch_bool const&, batch const&, batch const&) noexcept; + template + inline batch, A> select(batch_bool const&, batch, A> const&, batch, A> const&) noexcept; + template + inline batch sign(batch const& self) noexcept; + template + inline batch signnz(batch const& self) noexcept; + template + inline batch sin(batch const& self) noexcept; + template + inline batch sinh(batch const& self) noexcept; + template + inline std::pair, batch> sincos(batch const& self) noexcept; + template + inline batch sqrt(batch const& self) noexcept; + template + inline batch tan(batch const& self) noexcept; + template + inline batch, A> to_float(batch const& self) noexcept; + template + inline batch, A> to_int(batch const& self) noexcept; + template + inline batch trunc(batch const& self) noexcept; + + namespace kernel + { + + namespace detail + { + template + inline batch apply(F&& func, batch const& self, batch const& other) noexcept + { + constexpr std::size_t size = batch::size; + alignas(A::alignment()) T self_buffer[size]; + alignas(A::alignment()) T other_buffer[size]; + self.store_aligned(&self_buffer[0]); + other.store_aligned(&other_buffer[0]); + for (std::size_t i = 0; i < size; ++i) + { + self_buffer[i] = func(self_buffer[i], other_buffer[i]); + } + return batch::load_aligned(self_buffer); + } + + template + inline batch apply_transform(F&& func, batch const& self) noexcept + { + static_assert(batch::size == batch::size, + "Source and destination sizes must match"); + constexpr std::size_t src_size = batch::size; + constexpr std::size_t dest_size = batch::size; + alignas(A::alignment()) T self_buffer[src_size]; + alignas(A::alignment()) U other_buffer[dest_size]; + self.store_aligned(&self_buffer[0]); + for (std::size_t i = 0; i < src_size; ++i) + { + other_buffer[i] = func(self_buffer[i]); + } + return batch::load_aligned(other_buffer); + } + } + + namespace detail + { + // Generic conversion handling machinery. Each architecture must define + // conversion function when such conversions exits in the form of + // intrinsic. Then we use that information to automatically decide whether + // to use scalar or vector conversion when doing load / store / batch_cast + struct with_fast_conversion + { + }; + struct with_slow_conversion + { + }; + + template + struct conversion_type_impl + { + using type = with_slow_conversion; + }; + + using xsimd::detail::void_t; + + template + struct conversion_type_impl&>(), + std::declval&>(), + std::declval()))>> + { + using type = with_fast_conversion; + }; + + template + using conversion_type = typename conversion_type_impl::type; + } + + namespace detail + { + /* origin: boost/simdfunction/horn.hpp*/ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline B coef() noexcept + { + using value_type = typename B::value_type; + return B(bit_cast(as_unsigned_integer_t(c))); + } + template + inline B horner(const B&) noexcept + { + return B(typename B::value_type(0.)); + } + + template + inline B horner(const B&) noexcept + { + return coef(); + } + + template + inline B horner(const B& self) noexcept + { + return fma(self, horner(self), coef()); + } + + /* origin: boost/simdfunction/horn1.hpp*/ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline B horner1(const B&) noexcept + { + return B(1.); + } + + template + inline B horner1(const B& x) noexcept + { + return x + detail::coef(); + } + + template + inline B horner1(const B& x) noexcept + { + return fma(x, horner1(x), detail::coef()); + } + } + + } + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_logical.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_logical.hpp new file mode 100644 index 0000000000..dd446e83dd --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_logical.hpp @@ -0,0 +1,163 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_LOGICAL_HPP +#define XSIMD_GENERIC_LOGICAL_HPP + +#include "./xsimd_generic_details.hpp" + +namespace xsimd +{ + + namespace kernel + { + + using namespace types; + + // from mask + template + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) bool buffer[batch_bool::size]; + // This is inefficient but should never be called. It's just a + // temporary implementation until arm support is added. + for (size_t i = 0; i < batch_bool::size; ++i) + buffer[i] = mask & (1ull << i); + return batch_bool::load_aligned(buffer); + } + + // ge + template + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return other <= self; + } + + // gt + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return other < self; + } + + // is_even + template + inline batch_bool is_even(batch const& self, requires_arch) noexcept + { + return is_flint(self * T(0.5)); + } + + // is_flint + template + inline batch_bool is_flint(batch const& self, requires_arch) noexcept + { + auto frac = select(isnan(self - self), constants::nan>(), self - trunc(self)); + return frac == T(0.); + } + + // is_odd + template + inline batch_bool is_odd(batch const& self, requires_arch) noexcept + { + return is_even(self - T(1.)); + } + + // isinf + template ::value, void>::type> + inline batch_bool isinf(batch const&, requires_arch) noexcept + { + return batch_bool(false); + } + template + inline batch_bool isinf(batch const& self, requires_arch) noexcept + { + return abs(self) == std::numeric_limits::infinity(); + } + template + inline batch_bool isinf(batch const& self, requires_arch) noexcept + { + return abs(self) == std::numeric_limits::infinity(); + } + + // isfinite + template ::value, void>::type> + inline batch_bool isfinite(batch const&, requires_arch) noexcept + { + return batch_bool(true); + } + template + inline batch_bool isfinite(batch const& self, requires_arch) noexcept + { + return (self - self) == 0.f; + } + template + inline batch_bool isfinite(batch const& self, requires_arch) noexcept + { + return (self - self) == 0.; + } + + // isnan + template ::value, void>::type> + inline batch_bool isnan(batch const&, requires_arch) noexcept + { + return batch_bool(false); + } + + // le + template ::value, void>::type> + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return (self < other) || (self == other); + } + + // neq + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return !(other == self); + } + + // logical_and + template + inline batch logical_and(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept + { return x && y; }, + self, other); + } + + // logical_or + template + inline batch logical_or(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept + { return x || y; }, + self, other); + } + + // mask + template + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + alignas(A::alignment()) bool buffer[batch_bool::size]; + self.store_aligned(buffer); + // This is inefficient but should never be called. It's just a + // temporary implementation until arm support is added. + uint64_t res = 0; + for (size_t i = 0; i < batch_bool::size; ++i) + if (buffer[i]) + res |= 1ul << i; + return res; + } + } +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp new file mode 100644 index 0000000000..ea2f1567e4 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp @@ -0,0 +1,2418 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_MATH_HPP +#define XSIMD_GENERIC_MATH_HPP + +#include "../xsimd_scalar.hpp" +#include "./xsimd_generic_details.hpp" +#include "./xsimd_generic_trigo.hpp" + +#include + +namespace xsimd +{ + + namespace kernel + { + + using namespace types; + // abs + template ::value, void>::type*/> + inline batch abs(batch const& self, requires_arch) noexcept + { + if (std::is_unsigned::value) + return self; + else + { + auto sign = bitofsign(self); + auto inv = self ^ sign; + return inv - sign; + } + } + + template + inline batch abs(batch, A> const& z, requires_arch) noexcept + { + return hypot(z.real(), z.imag()); + } + + // batch_cast + template + inline batch batch_cast(batch const& self, batch const&, requires_arch) noexcept + { + return self; + } + + namespace detail + { + template + inline batch batch_cast(batch const& self, batch const& out, requires_arch, with_fast_conversion) noexcept + { + return fast_cast(self, out, A {}); + } + template + inline batch batch_cast(batch const& self, batch const&, requires_arch, with_slow_conversion) noexcept + { + static_assert(!std::is_same::value, "there should be no conversion for this type combination"); + using batch_type_in = batch; + using batch_type_out = batch; + static_assert(batch_type_in::size == batch_type_out::size, "compatible sizes"); + alignas(A::alignment()) T_in buffer_in[batch_type_in::size]; + alignas(A::alignment()) T_out buffer_out[batch_type_out::size]; + self.store_aligned(&buffer_in[0]); + std::copy(std::begin(buffer_in), std::end(buffer_in), std::begin(buffer_out)); + return batch_type_out::load_aligned(buffer_out); + } + + } + + template + inline batch batch_cast(batch const& self, batch const& out, requires_arch) noexcept + { + return detail::batch_cast(self, out, A {}, detail::conversion_type {}); + } + + // bitofsign + template + inline batch bitofsign(batch const& self, requires_arch) noexcept + { + static_assert(std::is_integral::value, "int type implementation"); + if (std::is_unsigned::value) + return batch(0); + else + return self >> (T)(8 * sizeof(T) - 1); + } + + template + inline batch bitofsign(batch const& self, requires_arch) noexcept + { + return self & constants::minuszero>(); + } + template + inline batch bitofsign(batch const& self, requires_arch) noexcept + { + return self & constants::minuszero>(); + } + + // bitwise_cast + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return self; + } + + // cbrt + /* origin: boost/simd/arch/common/simd/function/cbrt.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch cbrt(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type z = abs(self); +#ifndef XSIMD_NO_DENORMALS + auto denormal = z < constants::smallestposval(); + z = select(denormal, z * constants::twotonmb(), z); + batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); +#endif + const batch_type CBRT2(bit_cast(0x3fa14518)); + const batch_type CBRT4(bit_cast(0x3fcb2ff5)); + const batch_type CBRT2I(bit_cast(0x3f4b2ff5)); + const batch_type CBRT4I(bit_cast(0x3f214518)); + using i_type = as_integer_t; + i_type e; + batch_type x = frexp(z, e); + x = detail::horner(x); + auto flag = e >= i_type(0); + i_type e1 = abs(e); + i_type rem = e1; + e1 /= i_type(3); + rem -= e1 * i_type(3); + e = e1 * sign(e); + const batch_type cbrt2 = select(batch_bool_cast(flag), CBRT2, CBRT2I); + const batch_type cbrt4 = select(batch_bool_cast(flag), CBRT4, CBRT4I); + batch_type fact = select(batch_bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); + fact = select(batch_bool_cast(rem == i_type(2)), cbrt4, fact); + x = ldexp(x * fact, e); + x -= (x - z / (x * x)) * batch_type(1.f / 3.f); +#ifndef XSIMD_NO_DENORMALS + x = (x | bitofsign(self)) * f; +#else + x = x | bitofsign(self); +#endif +#ifndef XSIMD_NO_INFINITIES + return select(self == batch_type(0.) || isinf(self), self, x); +#else + return select(self == batch_type(0.), self, x); +#endif + } + + template + inline batch cbrt(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type z = abs(self); +#ifndef XSIMD_NO_DENORMALS + auto denormal = z < constants::smallestposval(); + z = select(denormal, z * constants::twotonmb(), z); + batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); +#endif + const batch_type CBRT2(bit_cast(int64_t(0x3ff428a2f98d728b))); + const batch_type CBRT4(bit_cast(int64_t(0x3ff965fea53d6e3d))); + const batch_type CBRT2I(bit_cast(int64_t(0x3fe965fea53d6e3d))); + const batch_type CBRT4I(bit_cast(int64_t(0x3fe428a2f98d728b))); + using i_type = as_integer_t; + i_type e; + batch_type x = frexp(z, e); + x = detail::horner(x); + auto flag = e >= typename i_type::value_type(0); + i_type e1 = abs(e); + i_type rem = e1; + e1 /= i_type(3); + rem -= e1 * i_type(3); + e = e1 * sign(e); + const batch_type cbrt2 = select(batch_bool_cast(flag), CBRT2, CBRT2I); + const batch_type cbrt4 = select(batch_bool_cast(flag), CBRT4, CBRT4I); + batch_type fact = select(batch_bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); + fact = select(batch_bool_cast(rem == i_type(2)), cbrt4, fact); + x = ldexp(x * fact, e); + x -= (x - z / (x * x)) * batch_type(1. / 3.); + x -= (x - z / (x * x)) * batch_type(1. / 3.); +#ifndef XSIMD_NO_DENORMALS + x = (x | bitofsign(self)) * f; +#else + x = x | bitofsign(self); +#endif +#ifndef XSIMD_NO_INFINITIES + return select(self == batch_type(0.) || isinf(self), self, x); +#else + return select(self == batch_type(0.), self, x); +#endif + } + + // clip + template + inline batch clip(batch const& self, batch const& lo, batch const& hi, requires_arch) noexcept + { + return min(hi, max(self, lo)); + } + + // copysign + template ::value, void>::type> + inline batch copysign(batch const& self, batch const& other, requires_arch) noexcept + { + return abs(self) | bitofsign(other); + } + + // erf + + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/erf_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct erf_kernel; + + template + struct erf_kernel> + { + using batch_type = batch; + // computes erf(a0)/a0 + // x is sqr(a0) and 0 <= abs(a0) <= 2/3 + static inline batch_type erf1(const batch_type& x) noexcept + { + return detail::horner(x); + } + + // computes erfc(x)*exp(sqr(x)) + // x >= 2/3 + static inline batch_type erfc2(const batch_type& x) noexcept + { + return detail::horner(x); + } + + static inline batch_type erfc3(const batch_type& x) noexcept + { + return (batch_type(1.) - x) * detail::horner(x); + } + }; + + template + struct erf_kernel> + { + using batch_type = batch; + // computes erf(a0)/a0 + // x is sqr(a0) and 0 <= abs(a0) <= 0.65 + static inline batch_type erf1(const batch_type& x) noexcept + { + return detail::horner(x) + / detail::horner(x); + } + + // computes erfc(x)*exp(x*x) + // 0.65 <= abs(x) <= 2.2 + static inline batch_type erfc2(const batch_type& x) noexcept + { + return detail::horner(x) + / detail::horner(x); + } + + // computes erfc(x)*exp(x*x) + // 2.2 <= abs(x) <= 6 + static inline batch_type erfc3(const batch_type& x) noexcept + { + return detail::horner(x) + / detail::horner(x); + } + + // computes erfc(rx)*exp(rx*rx) + // x >= 6 rx = 1/x + static inline batch_type erfc4(const batch_type& x) noexcept + { + return detail::horner(x); + } + }; + } + /* origin: boost/simd/arch/common/simd/function/erf.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + template + inline batch erf(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + batch_type r1(0.); + auto test1 = x < batch_type(2.f / 3.f); + if (any(test1)) + { + r1 = self * detail::erf_kernel::erf1(x * x); + if (all(test1)) + return r1; + } + batch_type z = x / (batch_type(1.) + x); + z -= batch_type(0.4f); + batch_type r2 = batch_type(1.) - exp(-x * x) * detail::erf_kernel::erfc2(z); + r2 = select(self < batch_type(0.), -r2, r2); + r1 = select(test1, r1, r2); +#ifndef XSIMD_NO_INFINITIES + r1 = select(xsimd::isinf(self), sign(self), r1); +#endif + return r1; + } + + template + inline batch erf(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + batch_type xx = x * x; + batch_type lim1(0.65); + batch_type lim2(2.2); + auto test1 = x < lim1; + batch_type r1(0.); + if (any(test1)) + { + r1 = self * detail::erf_kernel::erf1(xx); + if (all(test1)) + return r1; + } + auto test2 = x < lim2; + auto test3 = test2 && !test1; + batch_type ex = exp(-xx); + if (any(test3)) + { + batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc2(x); + batch_type r2 = select(self < batch_type(0.), -z, z); + r1 = select(test1, r1, r2); + if (all(test1 || test3)) + return r1; + } + batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc3(x); + z = select(self < batch_type(0.), -z, z); +#ifndef XSIMD_NO_INFINITIES + z = select(xsimd::isinf(self), sign(self), z); +#endif + return select(test2, r1, z); + } + + // erfc + template + inline batch erfc(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto test0 = self < batch_type(0.); + batch_type r1(0.); + batch_type z = x / (batch_type(1.) + x); + if (any(3.f * x < 2.f)) + { + r1 = detail::erf_kernel::erfc3(z); + } + else + { + z -= batch_type(0.4f); + r1 = exp(-x * x) * detail::erf_kernel::erfc2(z); + } +#ifndef XSIMD_NO_INFINITIES + r1 = select(x == constants::infinity(), batch_type(0.), r1); +#endif + return select(test0, batch_type(2.) - r1, r1); + } + + template + inline batch erfc(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + batch_type xx = x * x; + batch_type lim1(0.65); + batch_type lim2(2.2); + auto test0 = self < batch_type(0.); + auto test1 = x < lim1; + batch_type r1(0.); + if (any(test1)) + { + r1 = batch_type(1.) - x * detail::erf_kernel::erf1(xx); + if (all(test1)) + return select(test0, batch_type(2.) - r1, r1); + } + auto test2 = x < lim2; + auto test3 = test2 && !test1; + batch_type ex = exp(-xx); + if (any(test3)) + { + batch_type z = ex * detail::erf_kernel::erfc2(x); + r1 = select(test1, r1, z); + if (all(test1 || test3)) + return select(test0, batch_type(2.) - r1, r1); + } + batch_type z = ex * detail::erf_kernel::erfc3(x); + r1 = select(test2, r1, z); +#ifndef XSIMD_NO_INFINITIES + r1 = select(x == constants::infinity(), batch_type(0.), r1); +#endif + return select(test0, batch_type(2.) - r1, r1); + } + + // estrin + namespace detail + { + + template + struct estrin + { + B x; + + template + inline B operator()(const Ts&... coefs) noexcept + { + return eval(coefs...); + } + + private: + inline B eval(const B& c0) noexcept + { + return c0; + } + + inline B eval(const B& c0, const B& c1) noexcept + { + return fma(x, c1, c0); + } + + template + inline B eval(::xsimd::detail::index_sequence, const Tuple& tuple) + { + return estrin { x * x }(std::get(tuple)...); + } + + template + inline B eval(const std::tuple& tuple) noexcept + { + return eval(::xsimd::detail::make_index_sequence(), tuple); + } + + template + inline B eval(const std::tuple& tuple, const B& c0) noexcept + { + return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0)))); + } + + template + inline B eval(const std::tuple& tuple, const B& c0, const B& c1) noexcept + { + return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1)))); + } + + template + inline B eval(const std::tuple& tuple, const B& c0, const B& c1, const Ts&... coefs) noexcept + { + return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))), coefs...); + } + + template + inline B eval(const B& c0, const B& c1, const Ts&... coefs) noexcept + { + return eval(std::make_tuple(eval(c0, c1)), coefs...); + } + }; + } + + template + inline batch estrin(const batch& self) noexcept + { + using batch_type = batch; + return detail::estrin { self }(detail::coef()...); + } + + // exp + /* origin: boost/simd/arch/common/detail/simd/expo_base.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + namespace detail + { + enum exp_reduction_tag + { + exp_tag, + exp2_tag, + exp10_tag + }; + + template + struct exp_reduction_base; + + template + struct exp_reduction_base + { + static constexpr B maxlog() noexcept + { + return constants::maxlog(); + } + + static constexpr B minlog() noexcept + { + return constants::minlog(); + } + }; + + template + struct exp_reduction_base + { + static constexpr B maxlog() noexcept + { + return constants::maxlog10(); + } + + static constexpr B minlog() noexcept + { + return constants::minlog10(); + } + }; + + template + struct exp_reduction_base + { + static constexpr B maxlog() noexcept + { + return constants::maxlog2(); + } + + static constexpr B minlog() noexcept + { + return constants::minlog2(); + } + }; + + template + struct exp_reduction; + + template + struct exp_reduction : exp_reduction_base, exp_tag> + { + using batch_type = batch; + static inline batch_type approx(const batch_type& x) noexcept + { + batch_type y = detail::horner(x); + return ++fma(y, x * x, x); + } + + static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept + { + batch_type k = nearbyint(constants::invlog_2() * a); + x = fnma(k, constants::log_2hi(), a); + x = fnma(k, constants::log_2lo(), x); + return k; + } + }; + + template + struct exp_reduction : exp_reduction_base, exp10_tag> + { + using batch_type = batch; + static inline batch_type approx(const batch_type& x) noexcept + { + return ++(detail::horner(x) + * x); + } + + static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept + { + batch_type k = nearbyint(constants::invlog10_2() * a); + x = fnma(k, constants::log10_2hi(), a); + x -= k * constants::log10_2lo(); + return k; + } + }; + + template + struct exp_reduction : exp_reduction_base, exp2_tag> + { + using batch_type = batch; + static inline batch_type approx(const batch_type& x) noexcept + { + batch_type y = detail::horner(x); + return ++fma(y, x * x, x * constants::log_2()); + } + + static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept + { + batch_type k = nearbyint(a); + x = (a - k); + return k; + } + }; + + template + struct exp_reduction : exp_reduction_base, exp_tag> + { + using batch_type = batch; + static inline batch_type approx(const batch_type& x) noexcept + { + batch_type t = x * x; + return fnma(t, + detail::horner(t), + x); + } + + static inline batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) noexcept + { + batch_type k = nearbyint(constants::invlog_2() * a); + hi = fnma(k, constants::log_2hi(), a); + lo = k * constants::log_2lo(); + x = hi - lo; + return k; + } + + static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type& hi, const batch_type& lo) noexcept + { + return batch_type(1.) - (((lo - (x * c) / (batch_type(2.) - c)) - hi)); + } + }; + + template + struct exp_reduction : exp_reduction_base, exp10_tag> + { + using batch_type = batch; + static inline batch_type approx(const batch_type& x) noexcept + { + batch_type xx = x * x; + batch_type px = x * detail::horner(xx); + batch_type x2 = px / (detail::horner1(xx) - px); + return ++(x2 + x2); + } + + static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept + { + batch_type k = nearbyint(constants::invlog10_2() * a); + x = fnma(k, constants::log10_2hi(), a); + x = fnma(k, constants::log10_2lo(), x); + return k; + } + + static inline batch_type finalize(const batch_type&, const batch_type& c, const batch_type&, const batch_type&) noexcept + { + return c; + } + }; + + template + struct exp_reduction : exp_reduction_base, exp2_tag> + { + using batch_type = batch; + static inline batch_type approx(const batch_type& x) noexcept + { + batch_type t = x * x; + return fnma(t, + detail::horner(t), + x); + } + + static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept + { + batch_type k = nearbyint(a); + x = (a - k) * constants::log_2(); + return k; + } + + static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type&, const batch_type&) noexcept + { + return batch_type(1.) + x + x * c / (batch_type(2.) - c); + } + }; + + template + inline batch exp(batch const& self) noexcept + { + using batch_type = batch; + using reducer_t = exp_reduction; + batch_type x; + batch_type k = reducer_t::reduce(self, x); + x = reducer_t::approx(x); + x = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(x, to_int(k))); + x = select(self >= reducer_t::maxlog(), constants::infinity(), x); + return x; + } + + template + inline batch exp(batch const& self) noexcept + { + using batch_type = batch; + using reducer_t = exp_reduction; + batch_type hi, lo, x; + batch_type k = reducer_t::reduce(self, hi, lo, x); + batch_type c = reducer_t::approx(x); + c = reducer_t::finalize(x, c, hi, lo); + c = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(c, to_int(k))); + c = select(self >= reducer_t::maxlog(), constants::infinity(), c); + return c; + } + } + + template + inline batch exp(batch const& self, requires_arch) noexcept + { + return detail::exp(self); + } + + template + inline batch, A> exp(batch, A> const& self, requires_arch) noexcept + { + using batch_type = batch, A>; + auto isincos = sincos(self.imag()); + return exp(self.real()) * batch_type(std::get<1>(isincos), std::get<0>(isincos)); + } + + // exp10 + template + inline batch exp10(batch const& self, requires_arch) noexcept + { + return detail::exp(self); + } + + // exp2 + template + inline batch exp2(batch const& self, requires_arch) noexcept + { + return detail::exp(self); + } + + // expm1 + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/expm1_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + static inline batch expm1(const batch& a) noexcept + { + using batch_type = batch; + batch_type k = nearbyint(constants::invlog_2() * a); + batch_type x = fnma(k, constants::log_2hi(), a); + x = fnma(k, constants::log_2lo(), x); + batch_type hx = x * batch_type(0.5); + batch_type hxs = x * hx; + batch_type r = detail::horner(hxs); + batch_type t = fnma(r, hx, batch_type(3.)); + batch_type e = hxs * ((r - t) / (batch_type(6.) - x * t)); + e = fms(x, e, hxs); + using i_type = as_integer_t; + i_type ik = to_int(k); + batch_type two2mk = ::xsimd::bitwise_cast((constants::maxexponent() - ik) << constants::nmb()); + batch_type y = batch_type(1.) - two2mk - (e - x); + return ldexp(y, ik); + } + + template + static inline batch expm1(const batch& a) noexcept + { + using batch_type = batch; + batch_type k = nearbyint(constants::invlog_2() * a); + batch_type hi = fnma(k, constants::log_2hi(), a); + batch_type lo = k * constants::log_2lo(); + batch_type x = hi - lo; + batch_type hxs = x * x * batch_type(0.5); + batch_type r = detail::horner(hxs); + batch_type t = batch_type(3.) - r * batch_type(0.5) * x; + batch_type e = hxs * ((r - t) / (batch_type(6) - x * t)); + batch_type c = (hi - x) - lo; + e = (x * (e - c) - c) - hxs; + using i_type = as_integer_t; + i_type ik = to_int(k); + batch_type two2mk = ::xsimd::bitwise_cast((constants::maxexponent() - ik) << constants::nmb()); + batch_type ct1 = batch_type(1.) - two2mk - (e - x); + batch_type ct2 = ++(x - (e + two2mk)); + batch_type y = select(k < batch_type(20.), ct1, ct2); + return ldexp(y, ik); + } + + } + + template + inline batch expm1(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + return select(self < constants::logeps(), + batch_type(-1.), + select(self > constants::maxlog(), + constants::infinity(), + detail::expm1(self))); + } + + template + inline batch, A> expm1(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch isin = sin(z.imag()); + real_batch rem1 = expm1(z.real()); + real_batch re = rem1 + 1.; + real_batch si = sin(z.imag() * 0.5); + return { rem1 - 2. * re * si * si, re * isin }; + } + + // polar + template + inline batch, A> polar(const batch& r, const batch& theta, requires_arch) noexcept + { + auto sincosTheta = sincos(theta); + return { r * sincosTheta.second, r * sincosTheta.first }; + } + + // fdim + template + inline batch fdim(batch const& self, batch const& other, requires_arch) noexcept + { + return fmax(batch(0), self - other); + } + + // fmod + template + inline batch fmod(batch const& self, batch const& other, requires_arch) noexcept + { + return fnma(trunc(self / other), other, self); + } + + // frexp + /* origin: boost/simd/arch/common/simd/function/ifrexp.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch frexp(const batch& self, batch, A>& exp, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + i_type m1f = constants::mask1frexp(); + i_type r1 = m1f & ::xsimd::bitwise_cast(self); + batch_type x = self & ::xsimd::bitwise_cast(~m1f); + exp = (r1 >> constants::nmb()) - constants::maxexponentm1(); + exp = select(batch_bool_cast(self != batch_type(0.)), exp, i_type(typename i_type::value_type(0))); + return select((self != batch_type(0.)), x | ::xsimd::bitwise_cast(constants::mask2frexp()), batch_type(0.)); + } + + // from bool + template + inline batch from_bool(batch_bool const& self, requires_arch) noexcept + { + return batch(self.data) & batch(1); + } + + // horner + template + inline batch horner(const batch& self) noexcept + { + return detail::horner, Coefs...>(self); + } + + // hypot + template + inline batch hypot(batch const& self, batch const& other, requires_arch) noexcept + { + return sqrt(fma(self, self, other * other)); + } + + // ipow + template + inline batch ipow(batch const& self, ITy other, requires_arch) noexcept + { + return ::xsimd::detail::ipow(self, other); + } + + // ldexp + /* origin: boost/simd/arch/common/simd/function/ldexp.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept + { + using batch_type = batch; + using itype = as_integer_t; + itype ik = other + constants::maxexponent(); + ik = ik << constants::nmb(); + return self * ::xsimd::bitwise_cast(ik); + } + + // lgamma + template + inline batch lgamma(batch const& self, requires_arch) noexcept; + + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/gammaln_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + static inline batch gammalnB(const batch& x) noexcept + { + return horner, + 0x3ed87730, // 4.227843421859038E-001 + 0x3ea51a64, // 3.224669577325661E-001, + 0xbd89f07e, // -6.735323259371034E-002, + 0x3ca89ed8, // 2.058355474821512E-002, + 0xbbf164fd, // -7.366775108654962E-003, + 0x3b3ba883, // 2.863437556468661E-003, + 0xbaabeab1, // -1.311620815545743E-003, + 0x3a1ebb94 // 6.055172732649237E-004 + >(x); + } + + template + static inline batch gammalnC(const batch& x) noexcept + { + return horner, + 0xbf13c468, // -5.772156501719101E-001 + 0x3f528d34, // 8.224670749082976E-001, + 0xbecd27a8, // -4.006931650563372E-001, + 0x3e8a898b, // 2.705806208275915E-001, + 0xbe53c04f, // -2.067882815621965E-001, + 0x3e2d4dab, // 1.692415923504637E-001, + 0xbe22d329, // -1.590086327657347E-001, + 0x3e0c3c4f // 1.369488127325832E-001 + >(x); + } + + template + static inline batch gammaln2(const batch& x) noexcept + { + return horner, + 0x3daaaa94, // 8.333316229807355E-002f + 0xbb358701, // -2.769887652139868E-003f, + 0x3a31fd69 // 6.789774945028216E-004f + >(x); + } + + template + static inline batch gammaln1(const batch& x) noexcept + { + return horner, + 0xc12a0c675418055eull, // -8.53555664245765465627E5 + 0xc13a45890219f20bull, // -1.72173700820839662146E6, + 0xc131bc82f994db51ull, // -1.16237097492762307383E6, + 0xc1143d73f89089e5ull, // -3.31612992738871184744E5, + 0xc0e2f234355bb93eull, // -3.88016315134637840924E4, + 0xc09589018ff36761ull // -1.37825152569120859100E3 + >(x) + / horner, + 0xc13ece4b6a11e14aull, // -2.01889141433532773231E6 + 0xc1435255892ff34cull, // -2.53252307177582951285E6, + 0xc131628671950043ull, // -1.13933444367982507207E6, + 0xc10aeb84b9744c9bull, // -2.20528590553854454839E5, + 0xc0d0aa0d7b89d757ull, // -1.70642106651881159223E4, + 0xc075fd0d1cf312b2ull, // -3.51815701436523470549E2, + 0x3ff0000000000000ull // 1.00000000000000000000E0 + >(x); + } + + template + static inline batch gammalnA(const batch& x) noexcept + { + return horner, + 0x3fb555555555554bull, // 8.33333333333331927722E-2 + 0xbf66c16c16b0a5a1ull, // -2.77777777730099687205E-3, + 0x3f4a019f20dc5ebbull, // 7.93650340457716943945E-4, + 0xbf437fbdb580e943ull, // -5.95061904284301438324E-4, + 0x3f4a985027336661ull // 8.11614167470508450300E-4 + >(x); + } + + /* origin: boost/simd/arch/common/simd/function/gammaln.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct lgamma_impl; + + template + struct lgamma_impl> + { + using batch_type = batch; + static inline batch_type compute(const batch_type& a) noexcept + { + auto inf_result = (a <= batch_type(0.)) && is_flint(a); + batch_type x = select(inf_result, constants::nan(), a); + batch_type q = abs(x); +#ifndef XSIMD_NO_INFINITIES + inf_result = (x == constants::infinity()) || inf_result; +#endif + auto ltza = a < batch_type(0.); + batch_type r; + batch_type r1 = other(q); + if (any(ltza)) + { + r = select(inf_result, constants::infinity(), negative(q, r1)); + if (all(ltza)) + return r; + } + batch_type r2 = select(ltza, r, r1); + return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); + } + + private: + static inline batch_type negative(const batch_type& q, const batch_type& w) noexcept + { + batch_type p = floor(q); + batch_type z = q - p; + auto test2 = z < batch_type(0.5); + z = select(test2, z - batch_type(1.), z); + z = q * sin(z, trigo_pi_tag()); + return -log(constants::invpi() * abs(z)) - w; + } + + static inline batch_type other(const batch_type& x) noexcept + { + auto xlt650 = (x < batch_type(6.5)); + batch_type r0x = x; + batch_type r0z = x; + batch_type r0s = batch_type(1.); + batch_type r1 = batch_type(0.); + batch_type p = constants::nan(); + if (any(xlt650)) + { + batch_type z = batch_type(1.); + batch_type tx = select(xlt650, x, batch_type(0.)); + batch_type nx = batch_type(0.); + const batch_type _075 = batch_type(0.75); + const batch_type _150 = batch_type(1.50); + const batch_type _125 = batch_type(1.25); + const batch_type _250 = batch_type(2.50); + auto xge150 = (x >= _150); + auto txgt250 = (tx > _250); + + // x >= 1.5 + while (any(xge150 && txgt250)) + { + nx = select(txgt250, nx - batch_type(1.), nx); + tx = select(txgt250, x + nx, tx); + z = select(txgt250, z * tx, z); + txgt250 = (tx > _250); + } + r0x = select(xge150, x + nx - batch_type(2.), x); + r0z = select(xge150, z, r0z); + r0s = select(xge150, batch_type(1.), r0s); + + // x >= 1.25 && x < 1.5 + auto xge125 = (x >= _125); + auto xge125t = xge125 && !xge150; + if (any(xge125)) + { + r0x = select(xge125t, x - batch_type(1.), r0x); + r0z = select(xge125t, z * x, r0z); + r0s = select(xge125t, batch_type(-1.), r0s); + } + + // x >= 0.75 && x < 1.5 + batch_bool kernelC(false); + auto xge075 = (x >= _075); + auto xge075t = xge075 && !xge125; + if (any(xge075t)) + { + kernelC = xge075t; + r0x = select(xge075t, x - batch_type(1.), x); + r0z = select(xge075t, batch_type(1.), r0z); + r0s = select(xge075t, batch_type(-1.), r0s); + p = gammalnC(r0x); + } + + // tx < 1.5 && x < 0.75 + auto txlt150 = (tx < _150) && !xge075; + if (any(txlt150)) + { + auto orig = txlt150; + while (any(txlt150)) + { + z = select(txlt150, z * tx, z); + nx = select(txlt150, nx + batch_type(1.), nx); + tx = select(txlt150, x + nx, tx); + txlt150 = (tx < _150) && !xge075; + } + r0x = select(orig, r0x + nx - batch_type(2.), r0x); + r0z = select(orig, z, r0z); + r0s = select(orig, batch_type(-1.), r0s); + } + p = select(kernelC, p, gammalnB(r0x)); + if (all(xlt650)) + return fma(r0x, p, r0s * log(abs(r0z))); + } + r0z = select(xlt650, abs(r0z), x); + batch_type m = log(r0z); + r1 = fma(r0x, p, r0s * m); + batch_type r2 = fma(x - batch_type(0.5), m, constants::logsqrt2pi() - x); + r2 += gammaln2(batch_type(1.) / (x * x)) / x; + return select(xlt650, r1, r2); + } + }; + + template + struct lgamma_impl> + { + using batch_type = batch; + + static inline batch_type compute(const batch_type& a) noexcept + { + auto inf_result = (a <= batch_type(0.)) && is_flint(a); + batch_type x = select(inf_result, constants::nan(), a); + batch_type q = abs(x); +#ifndef XSIMD_NO_INFINITIES + inf_result = (q == constants::infinity()); +#endif + auto test = (a < batch_type(-34.)); + batch_type r = constants::nan(); + if (any(test)) + { + r = large_negative(q); + if (all(test)) + return select(inf_result, constants::nan(), r); + } + batch_type r1 = other(a); + batch_type r2 = select(test, r, r1); + return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); + } + + private: + static inline batch_type large_negative(const batch_type& q) noexcept + { + batch_type w = lgamma(q); + batch_type p = floor(q); + batch_type z = q - p; + auto test2 = (z < batch_type(0.5)); + z = select(test2, z - batch_type(1.), z); + z = q * sin(z, trigo_pi_tag()); + z = abs(z); + return constants::logpi() - log(z) - w; + } + + static inline batch_type other(const batch_type& xx) noexcept + { + batch_type x = xx; + auto test = (x < batch_type(13.)); + batch_type r1 = batch_type(0.); + if (any(test)) + { + batch_type z = batch_type(1.); + batch_type p = batch_type(0.); + batch_type u = select(test, x, batch_type(0.)); + auto test1 = (u >= batch_type(3.)); + while (any(test1)) + { + p = select(test1, p - batch_type(1.), p); + u = select(test1, x + p, u); + z = select(test1, z * u, z); + test1 = (u >= batch_type(3.)); + } + + auto test2 = (u < batch_type(2.)); + while (any(test2)) + { + z = select(test2, z / u, z); + p = select(test2, p + batch_type(1.), p); + u = select(test2, x + p, u); + test2 = (u < batch_type(2.)); + } + + z = abs(z); + x += p - batch_type(2.); + r1 = x * gammaln1(x) + log(z); + if (all(test)) + return r1; + } + batch_type r2 = fma(xx - batch_type(0.5), log(xx), constants::logsqrt2pi() - xx); + batch_type p = batch_type(1.) / (xx * xx); + r2 += gammalnA(p) / xx; + return select(test, r1, r2); + } + }; + } + + template + inline batch lgamma(batch const& self, requires_arch) noexcept + { + return detail::lgamma_impl>::compute(self); + } + + // log + /* origin: boost/simd/arch/common/simd/function/log.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch log(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + batch_type x = self; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(batch_bool_cast(test), k - i_type(23), k); + x = select(test, x * batch_type(8388608ul), x); + } +#endif + i_type ix = ::xsimd::bitwise_cast(x); + ix += 0x3f800000 - 0x3f3504f3; + k += (ix >> 23) - 0x7f; + ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; + x = ::xsimd::bitwise_cast(ix); + batch_type f = --x; + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type dk = to_float(k); + batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + inline batch log(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + + batch_type x = self; + i_type hx = ::xsimd::bitwise_cast(x) >> 32; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(batch_bool_cast(test), k - i_type(54), k); + x = select(test, x * batch_type(18014398509481984ull), x); + } +#endif + hx += 0x3ff00000 - 0x3fe6a09e; + k += (hx >> 20) - 0x3ff; + batch_type dk = to_float(k); + hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; + x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); + + batch_type f = --x; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + inline batch, A> log(const batch, A>& z, requires_arch) noexcept + { + return batch, A>(log(abs(z)), atan2(z.imag(), z.real())); + } + + // log2 + template + inline batch log2(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + batch_type x = self; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(batch_bool_cast(test), k - i_type(25), k); + x = select(test, x * batch_type(33554432ul), x); + } +#endif + i_type ix = ::xsimd::bitwise_cast(x); + ix += 0x3f800000 - 0x3f3504f3; + k += (ix >> 23) - 0x7f; + ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; + x = ::xsimd::bitwise_cast(ix); + batch_type f = --x; + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t1 + t2; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type dk = to_float(k); + batch_type r = fma(fms(s, hfsq + R, hfsq) + f, constants::invlog_2(), dk); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + inline batch log2(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + batch_type x = self; + i_type hx = ::xsimd::bitwise_cast(x) >> 32; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(batch_bool_cast(test), k - i_type(54), k); + x = select(test, x * batch_type(18014398509481984ull), x); + } +#endif + hx += 0x3ff00000 - 0x3fe6a09e; + k += (hx >> 20) - 0x3ff; + hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; + x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); + batch_type f = --x; + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type hi = f - hfsq; + hi = hi & ::xsimd::bitwise_cast((constants::allbits() << 32)); + batch_type lo = fma(s, hfsq + R, f - hi - hfsq); + batch_type val_hi = hi * constants::invlog_2hi(); + batch_type val_lo = fma(lo + hi, constants::invlog_2lo(), lo * constants::invlog_2hi()); + batch_type dk = to_float(k); + batch_type w1 = dk + val_hi; + val_lo += (dk - w1) + val_hi; + val_hi = w1; + batch_type r = val_lo + val_hi; +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + namespace detail + { + template + inline batch logN_complex_impl(const batch& z, typename batch::value_type base) noexcept + { + using batch_type = batch; + using rv_type = typename batch_type::value_type; + return log(z) / batch_type(rv_type(base)); + } + } + + template + inline batch, A> log2(batch, A> const& self, requires_arch) noexcept + { + return detail::logN_complex_impl(self, std::log(2)); + } + + // log10 + /* origin: FreeBSD /usr/src/lib/msun/src/e_log10f.c */ + /* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + template + inline batch log10(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type + ivln10hi(4.3432617188e-01f), + ivln10lo(-3.1689971365e-05f), + log10_2hi(3.0102920532e-01f), + log10_2lo(7.9034151668e-07f); + using int_type = as_integer_t; + using i_type = batch; + batch_type x = self; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(batch_bool_cast(test), k - i_type(25), k); + x = select(test, x * batch_type(33554432ul), x); + } +#endif + i_type ix = ::xsimd::bitwise_cast(x); + ix += 0x3f800000 - 0x3f3504f3; + k += (ix >> 23) - 0x7f; + ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; + x = ::xsimd::bitwise_cast(ix); + batch_type f = --x; + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type dk = to_float(k); + batch_type hfsq = batch_type(0.5) * f * f; + batch_type hibits = f - hfsq; + hibits &= ::xsimd::bitwise_cast(i_type(0xfffff000)); + batch_type lobits = fma(s, hfsq + R, f - hibits - hfsq); + batch_type r = fma(dk, log10_2hi, + fma(hibits, ivln10hi, + fma(lobits, ivln10hi, + fma(lobits + hibits, ivln10lo, dk * log10_2lo)))); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + inline batch log10(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type + ivln10hi(4.34294481878168880939e-01), + ivln10lo(2.50829467116452752298e-11), + log10_2hi(3.01029995663611771306e-01), + log10_2lo(3.69423907715893078616e-13); + using int_type = as_integer_t; + using i_type = batch; + batch_type x = self; + i_type hx = ::xsimd::bitwise_cast(x) >> 32; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(batch_bool_cast(test), k - i_type(54), k); + x = select(test, x * batch_type(18014398509481984ull), x); + } +#endif + hx += 0x3ff00000 - 0x3fe6a09e; + k += (hx >> 20) - 0x3ff; + hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; + x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); + batch_type f = --x; + batch_type dk = to_float(k); + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type hi = f - hfsq; + hi = hi & ::xsimd::bitwise_cast(constants::allbits() << 32); + batch_type lo = f - hi - hfsq + s * (hfsq + R); + batch_type val_hi = hi * ivln10hi; + batch_type y = dk * log10_2hi; + batch_type val_lo = dk * log10_2lo + (lo + hi) * ivln10lo + lo * ivln10hi; + batch_type w1 = y + val_hi; + val_lo += (y - w1) + val_hi; + val_hi = w1; + batch_type r = val_lo + val_hi; +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + inline batch, A> log10(const batch, A>& z, requires_arch) noexcept + { + return detail::logN_complex_impl(z, std::log(10)); + } + + // log1p + /* origin: boost/simd/arch/common/simd/function/log1p.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch log1p(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + const batch_type uf = self + batch_type(1.); + auto isnez = (uf != batch_type(0.)); + i_type iu = ::xsimd::bitwise_cast(uf); + iu += 0x3f800000 - 0x3f3504f3; + i_type k = (iu >> 23) - 0x7f; + iu = (iu & i_type(0x007fffff)) + 0x3f3504f3; + batch_type f = --(::xsimd::bitwise_cast(iu)); + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type dk = to_float(k); + /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ + batch_type c = select(batch_bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; + batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo() + c) - hfsq + f); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(uf >= batch_type(0.)), constants::nan(), zz); + } + + template + inline batch log1p(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + const batch_type uf = self + batch_type(1.); + auto isnez = (uf != batch_type(0.)); + i_type hu = ::xsimd::bitwise_cast(uf) >> 32; + hu += 0x3ff00000 - 0x3fe6a09e; + i_type k = (hu >> 20) - 0x3ff; + /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ + batch_type c = select(batch_bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; + hu = (hu & i_type(0x000fffff)) + 0x3fe6a09e; + batch_type f = ::xsimd::bitwise_cast((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast(uf))); + f = --f; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type dk = to_float(k); + batch_type r = fma(dk, constants::log_2hi(), fma(s, hfsq + R, dk * constants::log_2lo() + c) - hfsq + f); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(uf >= batch_type(0.)), constants::nan(), zz); + } + + template + inline batch, A> log1p(batch, A> const& self, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + batch_type u = 1 + self; + batch_type logu = log(u); + return select(u == batch_type(1.), + self, + select(u.real() <= real_batch(0.), + logu, + logu * self / (u - batch_type(1.)))); + } + + // mod + template ::value, void>::type> + inline batch mod(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept -> T + { return x % y; }, + self, other); + } + + // nearbyint + template ::value, void>::type> + inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return self; + } + namespace detail + { + template + inline batch nearbyintf(batch const& self) noexcept + { + using batch_type = batch; + batch_type s = bitofsign(self); + batch_type v = self ^ s; + batch_type t2n = constants::twotonmb(); + // Under fast-math, reordering is possible and the compiler optimizes d + // to v. That's not what we want, so prevent compiler optimization here. + // FIXME: it may be better to emit a memory barrier here (?). +#ifdef __FAST_MATH__ + volatile batch_type d0 = v + t2n; + batch_type d = *(batch_type*)(void*)(&d0) - t2n; +#else + batch_type d0 = v + t2n; + batch_type d = d0 - t2n; +#endif + return s ^ select(v < t2n, d, v); + } + } + template + inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return detail::nearbyintf(self); + } + template + inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return detail::nearbyintf(self); + } + + // nearbyint_as_int + template ::value, void>::type> + inline batch nearbyint_as_int(batch const& self, requires_arch) noexcept + { + return self; + } + + // nearbyint_as_int + template + inline batch, A> + nearbyint_as_int(batch const& self, requires_arch) noexcept + { + using U = as_integer_t; + return kernel::detail::apply_transform([](float x) noexcept -> U + { return std::lroundf(x); }, + self); + } + + template + inline batch, A> + nearbyint_as_int(batch const& self, requires_arch) noexcept + { + using U = as_integer_t; + return kernel::detail::apply_transform([](double x) noexcept -> U + { return std::llround(x); }, + self); + } + + // nextafter + namespace detail + { + template ::value> + struct nextafter_kernel + { + using batch_type = batch; + + static inline batch_type next(batch_type const& b) noexcept + { + return b; + } + + static inline batch_type prev(batch_type const& b) noexcept + { + return b; + } + }; + + template + struct bitwise_cast_batch; + + template + struct bitwise_cast_batch + { + using type = batch; + }; + + template + struct bitwise_cast_batch + { + using type = batch; + }; + + template + struct nextafter_kernel + { + using batch_type = batch; + using int_batch = typename bitwise_cast_batch::type; + using int_type = typename int_batch::value_type; + + static inline batch_type next(const batch_type& b) noexcept + { + batch_type n = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) + int_type(1)); + return select(b == constants::infinity(), b, n); + } + + static inline batch_type prev(const batch_type& b) noexcept + { + batch_type p = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) - int_type(1)); + return select(b == constants::minusinfinity(), b, p); + } + }; + } + template + inline batch nextafter(batch const& from, batch const& to, requires_arch) noexcept + { + using kernel = detail::nextafter_kernel; + return select(from == to, from, + select(to > from, kernel::next(from), kernel::prev(from))); + } + + // pow + /* origin: boost/simd/arch/common/simd/function/pow.hpp*/ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch pow(batch const& self, batch const& other, requires_arch) noexcept + { + using batch_type = batch; + const auto zero = batch_type(0.); + auto negx = self < zero; + auto iszero = self == zero; + constexpr T e = static_cast(2.718281828459045); + auto adj_self = select(iszero, batch_type(e), abs(self)); + batch_type z = exp(other * log(adj_self)); + z = select(iszero, zero, z); + z = select(is_odd(other) && negx, -z, z); + auto invalid = negx && !(is_flint(other) || isinf(other)); + return select(invalid, constants::nan(), z); + } + + template + inline batch, A> pow(const batch, A>& a, const batch, A>& z, requires_arch) noexcept + { + using cplx_batch = batch, A>; + using real_batch = typename cplx_batch::real_batch; + real_batch absa = abs(a); + real_batch arga = arg(a); + real_batch x = z.real(); + real_batch y = z.imag(); + real_batch r = pow(absa, x); + real_batch theta = x * arga; + real_batch ze(0); + auto cond = (y == ze); + r = select(cond, r, r * exp(-y * arga)); + theta = select(cond, theta, theta + y * log(absa)); + return select(absa == ze, cplx_batch(ze), cplx_batch(r * cos(theta), r * sin(theta))); + } + + // reciprocal + template ::value, void>::type> + inline batch reciprocal(batch const& self, + requires_arch) noexcept + { + using batch_type = batch; + return div(batch_type(1), self); + } + + // reduce_add + template + inline std::complex reduce_add(batch, A> const& self, requires_arch) noexcept + { + return { reduce_add(self.real()), reduce_add(self.imag()) }; + } + + namespace detail + { + template + struct split_high + { + static constexpr T get(T i, T) + { + return i >= N ? (i % 2) : i + N; + } + }; + + template + inline T reduce(Op, batch const& self, std::integral_constant) noexcept + { + return self.get(0); + } + + template + inline T reduce(Op op, batch const& self, std::integral_constant) noexcept + { + using index_type = as_unsigned_integer_t; + batch split = swizzle(self, make_batch_constant, split_high>()); + return reduce(op, op(split, self), std::integral_constant()); + } + } + + // reduce_max + template + inline T reduce_max(batch const& self, requires_arch) noexcept + { + return detail::reduce([](batch const& x, batch const& y) + { return max(x, y); }, + self, std::integral_constant::size>()); + } + + // reduce_min + template + inline T reduce_min(batch const& self, requires_arch) noexcept + { + return detail::reduce([](batch const& x, batch const& y) + { return min(x, y); }, + self, std::integral_constant::size>()); + } + + // remainder + template + inline batch remainder(batch const& self, batch const& other, requires_arch) noexcept + { + return fnma(nearbyint(self / other), other, self); + } + template + inline batch remainder(batch const& self, batch const& other, requires_arch) noexcept + { + return fnma(nearbyint(self / other), other, self); + } + template ::value, void>::type> + inline batch remainder(batch const& self, batch const& other, requires_arch) noexcept + { + auto mod = self % other; + return select(mod <= other / 2, mod, mod - other); + } + + // select + template + inline batch, A> select(batch_bool const& cond, batch, A> const& true_br, batch, A> const& false_br, requires_arch) noexcept + { + return { select(cond, true_br.real(), false_br.real()), select(cond, true_br.imag(), false_br.imag()) }; + } + + // sign + template ::value, void>::type> + inline batch sign(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type res = select(self > batch_type(0), batch_type(1), batch_type(0)) - select(self < batch_type(0), batch_type(1), batch_type(0)); + return res; + } + + namespace detail + { + template + inline batch signf(batch const& self) noexcept + { + using batch_type = batch; + batch_type res = select(self > batch_type(0.f), batch_type(1.f), batch_type(0.f)) - select(self < batch_type(0.f), batch_type(1.f), batch_type(0.f)); +#ifdef XSIMD_NO_NANS + return res; +#else + return select(isnan(self), constants::nan(), res); +#endif + } + } + + template + inline batch sign(batch const& self, requires_arch) noexcept + { + return detail::signf(self); + } + template + inline batch sign(batch const& self, requires_arch) noexcept + { + return detail::signf(self); + } + template + inline batch, A> sign(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + auto rz = z.real(); + auto iz = z.imag(); + return select(rz != real_batch(0.), + batch_type(sign(rz)), + batch_type(sign(iz))); + } + + // signnz + template ::value, void>::type> + inline batch signnz(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + return (self >> (sizeof(T) * 8 - 1)) | batch_type(1.); + } + + namespace detail + { + template + inline batch signnzf(batch const& self) noexcept + { + using batch_type = batch; +#ifndef XSIMD_NO_NANS + return select(isnan(self), constants::nan(), batch_type(1.) | (constants::signmask() & self)); +#else + return batch_type(1.) | (constants::signmask() & self); +#endif + } + } + + template + inline batch signnz(batch const& self, requires_arch) noexcept + { + return detail::signnzf(self); + } + template + inline batch signnz(batch const& self, requires_arch) noexcept + { + return detail::signnzf(self); + } + + // sqrt + template + inline batch, A> sqrt(batch, A> const& z, requires_arch) noexcept + { + + constexpr T csqrt_scale_factor = std::is_same::value ? 6.7108864e7f : 1.8014398509481984e16; + constexpr T csqrt_scale = std::is_same::value ? 1.220703125e-4f : 7.450580596923828125e-9; + using batch_type = batch, A>; + using real_batch = batch; + real_batch x = z.real(); + real_batch y = z.imag(); + real_batch sqrt_x = sqrt(fabs(x)); + real_batch sqrt_hy = sqrt(0.5 * fabs(y)); + auto cond = (fabs(x) > real_batch(4.) || fabs(y) > real_batch(4.)); + x = select(cond, x * 0.25, x * csqrt_scale_factor); + y = select(cond, y * 0.25, y * csqrt_scale_factor); + real_batch scale = select(cond, real_batch(2.), real_batch(csqrt_scale)); + real_batch r = abs(batch_type(x, y)); + + auto condxp = x > real_batch(0.); + real_batch t0 = select(condxp, xsimd::sqrt(0.5 * (r + x)), xsimd::sqrt(0.5 * (r - x))); + real_batch r0 = scale * fabs((0.5 * y) / t0); + t0 *= scale; + real_batch t = select(condxp, t0, r0); + r = select(condxp, r0, t0); + batch_type resg = select(y < real_batch(0.), batch_type(t, -r), batch_type(t, r)); + real_batch ze(0.); + + return select(y == ze, + select(x == ze, + batch_type(ze, ze), + select(x < ze, batch_type(ze, sqrt_x), batch_type(sqrt_x, ze))), + select(x == ze, + select(y > ze, batch_type(sqrt_hy, sqrt_hy), batch_type(sqrt_hy, -sqrt_hy)), + resg)); + } + + // tgamma + + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/stirling_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct stirling_kernel; + + template + struct stirling_kernel> + { + using batch_type = batch; + static inline batch_type compute(const batch_type& x) noexcept + { + return horner(x); + } + + static inline batch_type split_limit() noexcept + { + return batch_type(bit_cast(uint32_t(0x41d628f6))); + } + + static inline batch_type large_limit() noexcept + { + return batch_type(bit_cast(uint32_t(0x420c28f3))); + } + }; + + template + struct stirling_kernel> + { + using batch_type = batch; + static inline batch_type compute(const batch_type& x) noexcept + { + return horner(x); + } + + static inline batch_type split_limit() noexcept + { + return batch_type(bit_cast(uint64_t(0x4061e083ba3443d4))); + } + + static inline batch_type large_limit() noexcept + { + return batch_type(bit_cast(uint64_t(0x4065800000000000))); + } + }; + + /* origin: boost/simd/arch/common/simd/function/stirling.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch stirling(const batch& a) noexcept + { + using batch_type = batch; + const batch_type stirlingsplitlim = stirling_kernel::split_limit(); + const batch_type stirlinglargelim = stirling_kernel::large_limit(); + batch_type x = select(a >= batch_type(0.), a, constants::nan()); + batch_type w = batch_type(1.) / x; + w = fma(w, stirling_kernel::compute(w), batch_type(1.)); + batch_type y = exp(-x); + auto test = (x < stirlingsplitlim); + batch_type z = x - batch_type(0.5); + z = select(test, z, batch_type(0.5) * z); + batch_type v = exp(z * log(abs(x))); + y *= v; + y = select(test, y, y * v); + y *= constants::sqrt_2pi() * w; +#ifndef XSIMD_NO_INFINITIES + y = select(isinf(x), x, y); +#endif + return select(x > stirlinglargelim, constants::infinity(), y); + } + + /* origin: boost/simd/arch/common/detail/generic/gamma_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct tgamma_kernel; + + template + struct tgamma_kernel> + { + using batch_type = batch; + static inline batch_type compute(const batch_type& x) noexcept + { + return horner(x); + } + }; + + template + struct tgamma_kernel> + { + using batch_type = batch; + static inline batch_type compute(const batch_type& x) noexcept + { + return horner(x) + / horner(x); + } + }; + + /* origin: boost/simd/arch/common/simd/function/gamma.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline B tgamma_large_negative(const B& a) noexcept + { + B st = stirling(a); + B p = floor(a); + B sgngam = select(is_even(p), -B(1.), B(1.)); + B z = a - p; + auto test2 = z < B(0.5); + z = select(test2, z - B(1.), z); + z = a * sin(z, trigo_pi_tag()); + z = abs(z); + return sgngam * constants::pi() / (z * st); + } + + template + inline B tgamma_other(const B& a, const BB& test) noexcept + { + B x = select(test, B(2.), a); +#ifndef XSIMD_NO_INFINITIES + auto inf_result = (a == constants::infinity()); + x = select(inf_result, B(2.), x); +#endif + B z = B(1.); + auto test1 = (x >= B(3.)); + while (any(test1)) + { + x = select(test1, x - B(1.), x); + z = select(test1, z * x, z); + test1 = (x >= B(3.)); + } + test1 = (x < B(0.)); + while (any(test1)) + { + z = select(test1, z / x, z); + x = select(test1, x + B(1.), x); + test1 = (x < B(0.)); + } + auto test2 = (x < B(2.)); + while (any(test2)) + { + z = select(test2, z / x, z); + x = select(test2, x + B(1.), x); + test2 = (x < B(2.)); + } + x = z * tgamma_kernel::compute(x - B(2.)); +#ifndef XSIMD_NO_INFINITIES + return select(inf_result, a, x); +#else + return x; +#endif + } + } + + template + inline batch tgamma(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + auto nan_result = (self < batch_type(0.) && is_flint(self)); +#ifndef XSIMD_NO_INVALIDS + nan_result = isnan(self) || nan_result; +#endif + batch_type q = abs(self); + auto test = (self < batch_type(-33.)); + batch_type r = constants::nan(); + if (any(test)) + { + r = detail::tgamma_large_negative(q); + if (all(test)) + return select(nan_result, constants::nan(), r); + } + batch_type r1 = detail::tgamma_other(self, test); + batch_type r2 = select(test, r, r1); + return select(self == batch_type(0.), copysign(constants::infinity(), self), select(nan_result, constants::nan(), r2)); + } + + } + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp new file mode 100644 index 0000000000..bb40ddffc6 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp @@ -0,0 +1,397 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_MEMORY_HPP +#define XSIMD_GENERIC_MEMORY_HPP + +#include +#include +#include + +#include "../../types/xsimd_batch_constant.hpp" +#include "./xsimd_generic_details.hpp" + +namespace xsimd +{ + template + struct batch_constant; + + namespace kernel + { + + using namespace types; + + // extract_pair + template + inline batch extract_pair(batch const& self, batch const& other, std::size_t i, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + assert(i < size && "index in bounds"); + + alignas(A::alignment()) T self_buffer[size]; + self.store_aligned(self_buffer); + + alignas(A::alignment()) T other_buffer[size]; + other.store_aligned(other_buffer); + + alignas(A::alignment()) T concat_buffer[size]; + + for (std::size_t j = 0; j < (size - i); ++j) + { + concat_buffer[j] = other_buffer[i + j]; + if (j < i) + { + concat_buffer[size - 1 - j] = self_buffer[i - 1 - j]; + } + } + return batch::load_aligned(concat_buffer); + } + + // gather + namespace detail + { + template ::type = 0> + inline batch gather(U const* src, batch const& index, + ::xsimd::index I) noexcept + { + return insert(batch {}, static_cast(src[index.get(I)]), I); + } + + template ::type = 0> + inline batch + gather(U const* src, batch const& index, ::xsimd::index I) noexcept + { + static_assert(N <= batch::size, "Incorrect value in recursion!"); + + const auto test = gather(src, index, {}); + return insert(test, static_cast(src[index.get(I)]), I); + } + } // namespace detail + + template + inline batch + gather(batch const&, T const* src, batch const& index, + kernel::requires_arch) noexcept + { + static_assert(batch::size == batch::size, + "Index and destination sizes must match"); + + return detail::gather::size - 1, T, A>(src, index, {}); + } + + // Gather with runtime indexes and mismatched strides. + template + inline detail::sizes_mismatch_t> + gather(batch const&, U const* src, batch const& index, + kernel::requires_arch) noexcept + { + static_assert(batch::size == batch::size, + "Index and destination sizes must match"); + + return detail::gather::size - 1, T, A>(src, index, {}); + } + + // Gather with runtime indexes and matching strides. + template + inline detail::stride_match_t> + gather(batch const&, U const* src, batch const& index, + kernel::requires_arch) noexcept + { + static_assert(batch::size == batch::size, + "Index and destination sizes must match"); + + return batch_cast(kernel::gather(batch {}, src, index, A {})); + } + + // insert + template + inline batch insert(batch const& self, T val, index, requires_arch) noexcept + { + struct index_mask + { + static constexpr bool get(size_t index, size_t /* size*/) + { + return index != I; + } + }; + batch tmp(val); + return select(make_batch_bool_constant, index_mask>(), self, tmp); + } + + // get + template + inline T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + alignas(A::alignment()) T buffer[batch::size]; + self.store_aligned(&buffer[0]); + return buffer[I]; + } + + template + inline T get(batch_bool const& self, ::xsimd::index, requires_arch) noexcept + { + alignas(A::alignment()) T buffer[batch_bool::size]; + self.store_aligned(&buffer[0]); + return buffer[I]; + } + + template + inline auto get(batch, A> const& self, ::xsimd::index, requires_arch) noexcept -> typename batch, A>::value_type + { + alignas(A::alignment()) T buffer[batch, A>::size]; + self.store_aligned(&buffer[0]); + return buffer[I]; + } + + template + inline T get(batch const& self, std::size_t i, requires_arch) noexcept + { + alignas(A::alignment()) T buffer[batch::size]; + self.store_aligned(&buffer[0]); + return buffer[i]; + } + + template + inline T get(batch_bool const& self, std::size_t i, requires_arch) noexcept + { + alignas(A::alignment()) bool buffer[batch_bool::size]; + self.store_aligned(&buffer[0]); + return buffer[i]; + } + + template + inline auto get(batch, A> const& self, std::size_t i, requires_arch) noexcept -> typename batch, A>::value_type + { + using T2 = typename batch, A>::value_type; + alignas(A::alignment()) T2 buffer[batch, A>::size]; + self.store_aligned(&buffer[0]); + return buffer[i]; + } + + // load_aligned + namespace detail + { + template + inline batch load_aligned(T_in const* mem, convert, requires_arch, with_fast_conversion) noexcept + { + using batch_type_in = batch; + using batch_type_out = batch; + return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {}); + } + template + inline batch load_aligned(T_in const* mem, convert, requires_arch, with_slow_conversion) noexcept + { + static_assert(!std::is_same::value, "there should be a direct load for this type combination"); + using batch_type_out = batch; + alignas(A::alignment()) T_out buffer[batch_type_out::size]; + std::copy(mem, mem + batch_type_out::size, std::begin(buffer)); + return batch_type_out::load_aligned(buffer); + } + } + template + inline batch load_aligned(T_in const* mem, convert cvt, requires_arch) noexcept + { + return detail::load_aligned(mem, cvt, A {}, detail::conversion_type {}); + } + + // load_unaligned + namespace detail + { + template + inline batch load_unaligned(T_in const* mem, convert, requires_arch, with_fast_conversion) noexcept + { + using batch_type_in = batch; + using batch_type_out = batch; + return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {}); + } + + template + inline batch load_unaligned(T_in const* mem, convert cvt, requires_arch, with_slow_conversion) noexcept + { + static_assert(!std::is_same::value, "there should be a direct load for this type combination"); + return load_aligned(mem, cvt, generic {}, with_slow_conversion {}); + } + } + template + inline batch load_unaligned(T_in const* mem, convert cvt, requires_arch) noexcept + { + return detail::load_unaligned(mem, cvt, generic {}, detail::conversion_type {}); + } + + namespace detail + { + // Scatter with runtime indexes. + template ::type = 0> + inline void scatter(batch const& src, U* dst, + batch const& index, + ::xsimd::index I) noexcept + { + dst[index.get(I)] = static_cast(src.get(I)); + } + + template ::type = 0> + inline void + scatter(batch const& src, U* dst, batch const& index, + ::xsimd::index I) noexcept + { + static_assert(N <= batch::size, "Incorrect value in recursion!"); + + kernel::detail::scatter( + src, dst, index, {}); + dst[index.get(I)] = static_cast(src.get(I)); + } + } // namespace detail + + template + inline void + scatter(batch const& src, T* dst, + batch const& index, + kernel::requires_arch) noexcept + { + static_assert(batch::size == batch::size, + "Source and index sizes must match"); + kernel::detail::scatter::size - 1, T, A, T, V>( + src, dst, index, {}); + } + + template + inline detail::sizes_mismatch_t + scatter(batch const& src, U* dst, + batch const& index, + kernel::requires_arch) noexcept + { + static_assert(batch::size == batch::size, + "Source and index sizes must match"); + kernel::detail::scatter::size - 1, T, A, U, V>( + src, dst, index, {}); + } + + template + inline detail::stride_match_t + scatter(batch const& src, U* dst, + batch const& index, + kernel::requires_arch) noexcept + { + static_assert(batch::size == batch::size, + "Source and index sizes must match"); + const auto tmp = batch_cast(src); + kernel::scatter(tmp, dst, index, A {}); + } + + // store + template + inline void store(batch_bool const& self, bool* mem, requires_arch) noexcept + { + using batch_type = batch; + constexpr auto size = batch_bool::size; + alignas(A::alignment()) T buffer[size]; + kernel::store_aligned(&buffer[0], batch_type(self), A {}); + for (std::size_t i = 0; i < size; ++i) + mem[i] = bool(buffer[i]); + } + + // store_aligned + template + inline void store_aligned(T_out* mem, batch const& self, requires_arch) noexcept + { + static_assert(!std::is_same::value, "there should be a direct store for this type combination"); + alignas(A::alignment()) T_in buffer[batch::size]; + store_aligned(&buffer[0], self); + std::copy(std::begin(buffer), std::end(buffer), mem); + } + + // store_unaligned + template + inline void store_unaligned(T_out* mem, batch const& self, requires_arch) noexcept + { + static_assert(!std::is_same::value, "there should be a direct store for this type combination"); + return store_aligned(mem, self, generic {}); + } + + // swizzle + template + inline batch, A> swizzle(batch, A> const& self, batch_constant, Vs...> mask, requires_arch) noexcept + { + return { swizzle(self.real(), mask), swizzle(self.imag(), mask) }; + } + + namespace detail + { + template + inline batch, A> load_complex(batch const& /*hi*/, batch const& /*lo*/, requires_arch) noexcept + { + static_assert(std::is_same::value, "load_complex not implemented for the required architecture"); + } + + template + inline batch complex_high(batch, A> const& /*src*/, requires_arch) noexcept + { + static_assert(std::is_same::value, "complex_high not implemented for the required architecture"); + } + + template + inline batch complex_low(batch, A> const& /*src*/, requires_arch) noexcept + { + static_assert(std::is_same::value, "complex_low not implemented for the required architecture"); + } + } + + // load_complex_aligned + template + inline batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept + { + using real_batch = batch; + T_in const* buffer = reinterpret_cast(mem); + real_batch hi = real_batch::load_aligned(buffer), + lo = real_batch::load_aligned(buffer + real_batch::size); + return detail::load_complex(hi, lo, A {}); + } + + // load_complex_unaligned + template + inline batch, A> load_complex_unaligned(std::complex const* mem, convert>, requires_arch) noexcept + { + using real_batch = batch; + T_in const* buffer = reinterpret_cast(mem); + real_batch hi = real_batch::load_unaligned(buffer), + lo = real_batch::load_unaligned(buffer + real_batch::size); + return detail::load_complex(hi, lo, A {}); + } + + // store_complex_aligned + template + inline void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + using real_batch = batch; + real_batch hi = detail::complex_high(src, A {}); + real_batch lo = detail::complex_low(src, A {}); + T_out* buffer = reinterpret_cast(dst); + lo.store_aligned(buffer); + hi.store_aligned(buffer + real_batch::size); + } + + // store_compelx_unaligned + template + inline void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + using real_batch = batch; + real_batch hi = detail::complex_high(src, A {}); + real_batch lo = detail::complex_low(src, A {}); + T_out* buffer = reinterpret_cast(dst); + lo.store_unaligned(buffer); + hi.store_unaligned(buffer + real_batch::size); + } + + } + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_rounding.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_rounding.hpp new file mode 100644 index 0000000000..b6a79a4515 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_rounding.hpp @@ -0,0 +1,72 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_ROUNDING_HPP +#define XSIMD_GENERIC_ROUNDING_HPP + +#include "./xsimd_generic_details.hpp" + +namespace xsimd +{ + + namespace kernel + { + + using namespace types; + + // ceil + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + batch truncated_self = trunc(self); + return select(truncated_self < self, truncated_self + 1, truncated_self); + } + + // floor + template + inline batch floor(batch const& self, requires_arch) noexcept + { + batch truncated_self = trunc(self); + return select(truncated_self > self, truncated_self - 1, truncated_self); + } + + // round + template + inline batch round(batch const& self, requires_arch) noexcept + { + auto v = abs(self); + auto c = ceil(v); + auto cp = select(c - 0.5 > v, c - 1, c); + return select(v > constants::maxflint>(), self, copysign(cp, self)); + } + + // trunc + template ::value, void>::type> + inline batch trunc(batch const& self, requires_arch) noexcept + { + return self; + } + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); + } + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); + } + + } + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp new file mode 100644 index 0000000000..2568a7253f --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp @@ -0,0 +1,969 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_TRIGO_HPP +#define XSIMD_GENERIC_TRIGO_HPP + +#include "./xsimd_generic_details.hpp" + +#include + +namespace xsimd +{ + + namespace kernel + { + /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + using namespace types; + + // acos + template + inline batch acos(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto x_larger_05 = x > batch_type(0.5); + x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self); + x = asin(x); + x = select(x_larger_05, x + x, x); + x = select(self < batch_type(-0.5), constants::pi() - x, x); + return select(x_larger_05, x, constants::pio2() - x); + } + template + inline batch, A> acos(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + batch_type tmp = asin(z); + return { constants::pio2() - tmp.real(), -tmp.imag() }; + } + + // acosh + /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch acosh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = self - batch_type(1.); + auto test = x > constants::oneotwoeps(); + batch_type z = select(test, self, x + sqrt(x + x + x * x)); + batch_type l1pz = log1p(z); + return select(test, l1pz + constants::log_2(), l1pz); + } + template + inline batch, A> acosh(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + batch_type w = acos(z); + w = batch_type(-w.imag(), w.real()); + return w; + } + + // asin + template + inline batch asin(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + batch_type sign = bitofsign(self); + auto x_larger_05 = x > batch_type(0.5); + batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x); + x = select(x_larger_05, sqrt(z), x); + batch_type z1 = detail::horner(z); + z1 = fma(z1, z * x, x); + z = select(x_larger_05, constants::pio2() - (z1 + z1), z1); + return z ^ sign; + } + template + inline batch asin(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto small_cond = x < constants::sqrteps(); + batch_type ct1 = batch_type(bit_cast(int64_t(0x3fe4000000000000))); + batch_type zz1 = batch_type(1.) - x; + batch_type vp = zz1 * detail::horner(zz1) / detail::horner1(zz1); + zz1 = sqrt(zz1 + zz1); + batch_type z = constants::pio4() - zz1; + zz1 = fms(zz1, vp, constants::pio_2lo()); + z = z - zz1; + zz1 = z + constants::pio4(); + batch_type zz2 = self * self; + z = zz2 * detail::horner(zz2) / detail::horner1(zz2); + zz2 = fma(x, z, x); + return select(x > batch_type(1.), constants::nan(), + select(small_cond, x, + select(x > ct1, zz1, zz2)) + ^ bitofsign(self)); + } + template + inline batch, A> asin(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch x = z.real(); + real_batch y = z.imag(); + + batch_type ct(-y, x); + batch_type zz(real_batch(1.) - (x - y) * (x + y), -2 * x * y); + zz = log(ct + sqrt(zz)); + batch_type resg(zz.imag(), -zz.real()); + + return select(y == real_batch(0.), + select(fabs(x) > real_batch(1.), + batch_type(constants::pio2(), real_batch(0.)), + batch_type(asin(x), real_batch(0.))), + resg); + } + + // asinh + /* origin: boost/simd/arch/common/simd/function/asinh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + namespace detail + { + template ::value, void>::type> + inline batch + average(const batch& x1, const batch& x2) noexcept + { + return (x1 & x2) + ((x1 ^ x2) >> 1); + } + + template + inline batch + averagef(const batch& x1, const batch& x2) noexcept + { + using batch_type = batch; + return fma(x1, batch_type(0.5), x2 * batch_type(0.5)); + } + template + inline batch average(batch const& x1, batch const& x2) noexcept + { + return averagef(x1, x2); + } + template + inline batch average(batch const& x1, batch const& x2) noexcept + { + return averagef(x1, x2); + } + } + template + inline batch asinh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto lthalf = x < batch_type(0.5); + batch_type x2 = x * x; + batch_type bts = bitofsign(self); + batch_type z(0.); + if (any(lthalf)) + { + z = detail::horner(x2) + * x; + if (all(lthalf)) + return z ^ bts; + } + batch_type tmp = select(x > constants::oneosqrteps(), x, detail::average(x, hypot(batch_type(1.), x))); +#ifndef XSIMD_NO_NANS + return select(isnan(self), constants::nan(), select(lthalf, z, log(tmp) + constants::log_2()) ^ bts); +#else + return select(lthalf, z, log(tmp) + constants::log_2()) ^ bts; +#endif + } + template + inline batch asinh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto test = x > constants::oneosqrteps(); + batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) + hypot(batch_type(1.), x))); +#ifndef XSIMD_NO_INFINITIES + z = select(x == constants::infinity(), x, z); +#endif + batch_type l1pz = log1p(z); + z = select(test, l1pz + constants::log_2(), l1pz); + return bitofsign(self) ^ z; + } + template + inline batch, A> asinh(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + batch_type w = asin(batch_type(-z.imag(), z.real())); + w = batch_type(w.imag(), -w.real()); + return w; + } + + // atan + namespace detail + { + template + static inline batch kernel_atan(const batch& x, const batch& recx) noexcept + { + using batch_type = batch; + const auto flag1 = x < constants::tan3pio8(); + const auto flag2 = (x >= batch_type(bit_cast((uint32_t)0x3ed413cd))) && flag1; + batch_type yy = select(flag1, batch_type(0.), constants::pio2()); + yy = select(flag2, constants::pio4(), yy); + batch_type xx = select(flag1, x, -recx); + xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); + const batch_type z = xx * xx; + batch_type z1 = detail::horner(z); + z1 = fma(xx, z1 * z, xx); + z1 = select(flag2, z1 + constants::pio_4lo(), z1); + z1 = select(!flag1, z1 + constants::pio_2lo(), z1); + return yy + z1; + } + template + static inline batch kernel_atan(const batch& x, const batch& recx) noexcept + { + using batch_type = batch; + const auto flag1 = x < constants::tan3pio8(); + const auto flag2 = (x >= constants::tanpio8()) && flag1; + batch_type yy = select(flag1, batch_type(0.), constants::pio2()); + yy = select(flag2, constants::pio4(), yy); + batch_type xx = select(flag1, x, -recx); + xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); + batch_type z = xx * xx; + z *= detail::horner(z) + / detail::horner1(z); + z = fma(xx, z, xx); + z = select(flag2, z + constants::pio_4lo(), z); + z = z + select(flag1, batch_type(0.), constants::pio_2lo()); + return yy + z; + } + } + template + inline batch atan(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type absa = abs(self); + const batch_type x = detail::kernel_atan(absa, batch_type(1.) / absa); + return x ^ bitofsign(self); + } + template + inline batch, A> atan(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch x = z.real(); + real_batch y = z.imag(); + real_batch x2 = x * x; + real_batch one(1.); + real_batch a = one - x2 - (y * y); + real_batch w = 0.5 * atan2(2. * x, a); + real_batch num = y + one; + num = x2 + num * num; + real_batch den = y - one; + den = x2 + den * den; + batch_type res = select((x == real_batch(0.)) && (y == real_batch(1.)), + batch_type(real_batch(0.), constants::infinity()), + batch_type(w, 0.25 * log(num / den))); + return res; + } + + // atanh + /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch atanh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + batch_type t = x + x; + batch_type z = batch_type(1.) - x; + auto test = x < batch_type(0.5); + batch_type tmp = select(test, x, t) / z; + return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp))); + } + template + inline batch, A> atanh(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + batch_type w = atan(batch_type(-z.imag(), z.real())); + w = batch_type(w.imag(), -w.real()); + return w; + } + + // atan2 + template + inline batch atan2(batch const& self, batch const& other, requires_arch) noexcept + { + using batch_type = batch; + const batch_type q = abs(self / other); + const batch_type z = detail::kernel_atan(q, batch_type(1.) / q); + return select(other > batch_type(0.), z, constants::pi() - z) * signnz(self); + } + + // cos + namespace detail + { + template + inline batch quadrant(const batch& x) noexcept + { + return x & batch(3); + } + + template + inline batch quadrant(const batch& x) noexcept + { + return to_float(quadrant(to_int(x))); + } + + template + inline batch quadrant(const batch& x) noexcept + { + using batch_type = batch; + batch_type a = x * batch_type(0.25); + return (a - floor(a)) * batch_type(4.); + } + /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + template + inline batch cos_eval(const batch& z) noexcept + { + using batch_type = batch; + batch_type y = detail::horner(z); + return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z); + } + + template + inline batch sin_eval(const batch& z, const batch& x) noexcept + { + using batch_type = batch; + batch_type y = detail::horner(z); + return fma(y * z, x, x); + } + + template + static inline batch base_tancot_eval(const batch& z) noexcept + { + using batch_type = batch; + batch_type zz = z * z; + batch_type y = detail::horner(zz); + return fma(y, zz * z, z); + } + + template + static inline batch tan_eval(const batch& z, const BB& test) noexcept + { + using batch_type = batch; + batch_type y = base_tancot_eval(z); + return select(test, y, -batch_type(1.) / y); + } + + template + static inline batch cot_eval(const batch& z, const BB& test) noexcept + { + using batch_type = batch; + batch_type y = base_tancot_eval(z); + return select(test, batch_type(1.) / y, -y); + } + + /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + static inline batch cos_eval(const batch& z) noexcept + { + using batch_type = batch; + batch_type y = detail::horner(z); + return batch_type(1.) - y * z; + } + + template + static inline batch sin_eval(const batch& z, const batch& x) noexcept + { + using batch_type = batch; + batch_type y = detail::horner(z); + return fma(y * z, x, x); + } + + template + static inline batch base_tancot_eval(const batch& z) noexcept + { + using batch_type = batch; + batch_type zz = z * z; + batch_type num = detail::horner(zz); + batch_type den = detail::horner1(zz); + return fma(z, (zz * (num / den)), z); + } + + template + static inline batch tan_eval(const batch& z, const BB& test) noexcept + { + using batch_type = batch; + batch_type y = base_tancot_eval(z); + return select(test, y, -batch_type(1.) / y); + } + + template + static inline batch cot_eval(const batch& z, const BB& test) noexcept + { + using batch_type = batch; + batch_type y = base_tancot_eval(z); + return select(test, batch_type(1.) / y, -y); + } + /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + struct trigo_radian_tag + { + }; + struct trigo_pi_tag + { + }; + + template + struct trigo_reducer + { + static inline B reduce(const B& x, B& xr) noexcept + { + if (all(x <= constants::pio4())) + { + xr = x; + return B(0.); + } + else if (all(x <= constants::pio2())) + { + auto test = x > constants::pio4(); + xr = x - constants::pio2_1(); + xr -= constants::pio2_2(); + xr -= constants::pio2_3(); + xr = select(test, xr, x); + return select(test, B(1.), B(0.)); + } + else if (all(x <= constants::twentypi())) + { + B xi = nearbyint(x * constants::twoopi()); + xr = fnma(xi, constants::pio2_1(), x); + xr -= xi * constants::pio2_2(); + xr -= xi * constants::pio2_3(); + return quadrant(xi); + } + else if (all(x <= constants::mediumpi())) + { + B fn = nearbyint(x * constants::twoopi()); + B r = x - fn * constants::pio2_1(); + B w = fn * constants::pio2_1t(); + B t = r; + w = fn * constants::pio2_2(); + r = t - w; + w = fn * constants::pio2_2t() - ((t - r) - w); + t = r; + w = fn * constants::pio2_3(); + r = t - w; + w = fn * constants::pio2_3t() - ((t - r) - w); + xr = r - w; + return quadrant(fn); + } + else + { + static constexpr std::size_t size = B::size; + using value_type = typename B::value_type; + alignas(B) std::array tmp; + alignas(B) std::array txr; + alignas(B) std::array args; + x.store_aligned(args.data()); + + for (std::size_t i = 0; i < size; ++i) + { + double arg = args[i]; + if (arg == std::numeric_limits::infinity()) + { + tmp[i] = 0.; + txr[i] = std::numeric_limits::quiet_NaN(); + } + else + { + double y[2]; + std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y); + tmp[i] = value_type(n & 3); + txr[i] = value_type(y[0]); + } + } + xr = B::load_aligned(&txr[0]); + B res = B::load_aligned(&tmp[0]); + return res; + } + } + }; + + template + struct trigo_reducer + { + static inline B reduce(const B& x, B& xr) noexcept + { + B xi = nearbyint(x * B(2.)); + B x2 = x - xi * B(0.5); + xr = x2 * constants::pi(); + return quadrant(xi); + } + }; + + } + template + inline batch cos(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type z = xr * xr; + const batch_type se = detail::sin_eval(z, xr); + const batch_type ce = detail::cos_eval(z); + const batch_type z1 = select(swap_bit != batch_type(0.), se, ce); + return z1 ^ sign_bit; + } + + template + inline batch, A> cos(batch, A> const& z, requires_arch) noexcept + { + return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) }; + } + + // cosh + + /* origin: boost/simd/arch/common/simd/function/cosh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + template + inline batch cosh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto test1 = x > (constants::maxlog() - constants::log_2()); + batch_type fac = select(test1, batch_type(0.5), batch_type(1.)); + batch_type tmp = exp(x * fac); + batch_type tmp1 = batch_type(0.5) * tmp; + return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp)); + } + template + inline batch, A> cosh(const batch, A>& z, requires_arch) noexcept + { + auto x = z.real(); + auto y = z.imag(); + return { cosh(x) * cos(y), sinh(x) * sin(y) }; + } + + // sin + namespace detail + { + template + inline batch sin(batch const& self, Tag = Tag()) noexcept + { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + auto sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type z = xr * xr; + const batch_type se = detail::sin_eval(z, xr); + const batch_type ce = detail::cos_eval(z); + const batch_type z1 = select(swap_bit == batch_type(0.), se, ce); + return z1 ^ sign_bit; + } + } + + template + inline batch sin(batch const& self, requires_arch) noexcept + { + return detail::sin(self); + } + + template + inline batch, A> sin(batch, A> const& z, requires_arch) noexcept + { + return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) }; + } + + // sincos + template + inline std::pair, batch> sincos(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + const batch_type z = xr * xr; + const batch_type se = detail::sin_eval(z, xr); + const batch_type ce = detail::cos_eval(z); + auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce); + auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce); + return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit); + } + + template + inline std::pair, A>, batch, A>> + sincos(batch, A> const& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch rcos = cos(z.real()); + real_batch rsin = sin(z.real()); + real_batch icosh = cosh(z.imag()); + real_batch isinh = sinh(z.imag()); + return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh)); + } + + // sinh + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch sinh_kernel(batch const& self) noexcept + { + using batch_type = batch; + batch_type sqr_self = self * self; + return detail::horner(sqr_self) + * self; + } + + template + inline batch sinh_kernel(batch const& self) noexcept + { + using batch_type = batch; + batch_type sqrself = self * self; + return fma(self, (detail::horner(sqrself) + / detail::horner1(sqrself)) + * sqrself, + self); + } + } + /* origin: boost/simd/arch/common/simd/function/sinh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch sinh(batch const& a, requires_arch) noexcept + { + using batch_type = batch; + batch_type half(0.5); + batch_type x = abs(a); + auto lt1 = x < batch_type(1.); + batch_type bts = bitofsign(a); + batch_type z(0.); + if (any(lt1)) + { + z = detail::sinh_kernel(x); + if (all(lt1)) + return z ^ bts; + } + auto test1 = x > (constants::maxlog() - constants::log_2()); + batch_type fac = select(test1, half, batch_type(1.)); + batch_type tmp = exp(x * fac); + batch_type tmp1 = half * tmp; + batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp); + return select(lt1, z, r) ^ bts; + } + template + inline batch, A> sinh(const batch, A>& z, requires_arch) noexcept + { + auto x = z.real(); + auto y = z.imag(); + return { sinh(x) * cos(y), cosh(x) * sin(y) }; + } + + // tan + template + inline batch tan(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + auto test = (swap_bit == batch_type(0.)); + const batch_type y = detail::tan_eval(xr, test); + return y ^ bitofsign(self); + } + template + inline batch, A> tan(batch, A> const& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch d = cos(2 * z.real()) + cosh(2 * z.imag()); + batch_type winf(constants::infinity(), constants::infinity()); + real_batch wreal = sin(2 * z.real()) / d; + real_batch wimag = sinh(2 * z.imag()); + batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d)); + return select(d == real_batch(0.), winf, wres); + } + + // tanh + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct tanh_kernel; + + template + struct tanh_kernel> + { + using batch_type = batch; + static inline batch_type tanh(const batch_type& x) noexcept + { + batch_type sqrx = x * x; + return fma(detail::horner(sqrx) + * sqrx, + x, x); + } + + static inline batch_type cotanh(const batch_type& x) noexcept + { + return batch_type(1.) / tanh(x); + } + }; + + template + struct tanh_kernel> + { + using batch_type = batch; + static inline batch_type tanh(const batch_type& x) noexcept + { + batch_type sqrx = x * x; + return fma(sqrx * p(sqrx) / q(sqrx), x, x); + } + + static inline batch_type cotanh(const batch_type& x) noexcept + { + batch_type sqrx = x * x; + batch_type qval = q(sqrx); + return qval / (x * fma(p(sqrx), sqrx, qval)); + } + + static inline batch_type p(const batch_type& x) noexcept + { + return detail::horner(x); + } + + static inline batch_type q(const batch_type& x) noexcept + { + return detail::horner1(x); + } + }; + + } + /* origin: boost/simd/arch/common/simd/function/tanh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch tanh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type one(1.); + batch_type x = abs(self); + auto test = x < (batch_type(5.) / batch_type(8.)); + batch_type bts = bitofsign(self); + batch_type z = one; + if (any(test)) + { + z = detail::tanh_kernel::tanh(x); + if (all(test)) + return z ^ bts; + } + batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one); + return select(test, z, r) ^ bts; + } + template + inline batch, A> tanh(const batch, A>& z, requires_arch) noexcept + { + using real_batch = typename batch, A>::real_batch; + auto x = z.real(); + auto y = z.imag(); + real_batch two(2); + auto d = cosh(two * x) + cos(two * y); + return { sinh(two * x) / d, sin(two * y) / d }; + } + + } + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp new file mode 100644 index 0000000000..be1da61358 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp @@ -0,0 +1,1657 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX_HPP +#define XSIMD_AVX_HPP + +#include +#include +#include + +#include "../types/xsimd_avx_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // fwd + template + inline batch insert(batch const& self, T val, index, requires_arch) noexcept; + + namespace detail + { + inline void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept + { + low = _mm256_castsi256_si128(val); + high = _mm256_extractf128_si256(val, 1); + } + inline void split_avx(__m256 val, __m128& low, __m128& high) noexcept + { + low = _mm256_castps256_ps128(val); + high = _mm256_extractf128_ps(val, 1); + } + inline void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept + { + low = _mm256_castpd256_pd128(val); + high = _mm256_extractf128_pd(val, 1); + } + inline __m256i merge_sse(__m128i low, __m128i high) noexcept + { + return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1); + } + inline __m256 merge_sse(__m128 low, __m128 high) noexcept + { + return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1); + } + inline __m256d merge_sse(__m128d low, __m128d high) noexcept + { + return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1); + } + template + inline __m256i fwd_to_sse(F f, __m256i self) noexcept + { + __m128i self_low, self_high; + split_avx(self, self_low, self_high); + __m128i res_low = f(self_low); + __m128i res_high = f(self_high); + return merge_sse(res_low, res_high); + } + template + inline __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept + { + __m128i self_low, self_high, other_low, other_high; + split_avx(self, self_low, self_high); + split_avx(other, other_low, other_high); + __m128i res_low = f(self_low, other_low); + __m128i res_high = f(self_high, other_high); + return merge_sse(res_low, res_high); + } + template + inline __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept + { + __m128i self_low, self_high; + split_avx(self, self_low, self_high); + __m128i res_low = f(self_low, other); + __m128i res_high = f(self_high, other); + return merge_sse(res_low, res_high); + } + } + + // abs + template + inline batch abs(batch const& self, requires_arch) noexcept + { + __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31 + return _mm256_andnot_ps(sign_mask, self); + } + template + inline batch abs(batch const& self, requires_arch) noexcept + { + __m256d sign_mask = _mm256_set1_pd(-0.f); // -0.f = 1 << 31 + return _mm256_andnot_pd(sign_mask, self); + } + + // add + template ::value, void>::type> + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return add(batch(s), batch(o)); }, + self, other); + } + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_add_ps(self, other); + } + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_add_pd(self, other); + } + + // all + template + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm256_testc_ps(self, batch_bool(true)) != 0; + } + template + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm256_testc_pd(self, batch_bool(true)) != 0; + } + template ::value, void>::type> + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm256_testc_si256(self, batch_bool(true)) != 0; + } + + // any + template + inline bool any(batch_bool const& self, requires_arch) noexcept + { + return !_mm256_testz_ps(self, self); + } + template + inline bool any(batch_bool const& self, requires_arch) noexcept + { + return !_mm256_testz_pd(self, self); + } + template ::value, void>::type> + inline bool any(batch_bool const& self, requires_arch) noexcept + { + return !_mm256_testz_si256(self, self); + } + + // batch_bool_cast + template + inline batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return { bitwise_cast(batch(self.data)).data }; + } + + // bitwise_and + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_and_ps(self, other); + } + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_and_pd(self, other); + } + + template + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_and_ps(self, other); + } + template + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_and_pd(self, other); + } + + template ::value, void>::type> + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_and(batch(s), batch(o)); }, + self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_and(batch(s), batch(o)); }, + self, other); + } + + // bitwise_andnot + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_andnot_ps(other, self); + } + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_andnot_pd(other, self); + } + + template + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_andnot_ps(other, self); + } + template + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_andnot_pd(other, self); + } + + template ::value, void>::type> + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_andnot(batch(s), batch(o)); }, + self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_andnot(batch(s), batch(o)); }, + self, other); + } + + // bitwise_lshift + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept + { return bitwise_lshift(batch(s), o, sse4_2 {}); }, + self, other); + } + + // bitwise_not + template ::value, void>::type> + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s) noexcept + { return bitwise_not(batch(s), sse4_2 {}); }, + self); + } + template ::value, void>::type> + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s) noexcept + { return bitwise_not(batch_bool(s), sse4_2 {}); }, + self); + } + + // bitwise_or + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_or_ps(self, other); + } + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_or_pd(self, other); + } + template + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_or_ps(self, other); + } + template + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_or_pd(self, other); + } + template ::value, void>::type> + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_or(batch(s), batch(o)); }, + self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_or(batch_bool(s), batch_bool(o)); }, + self, other); + } + + // bitwise_rshift + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept + { return bitwise_rshift(batch(s), o, sse4_2 {}); }, + self, other); + } + + // bitwise_xor + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_xor_ps(self, other); + } + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_xor_pd(self, other); + } + template + inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_ps(self, other); + } + template + inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_pd(self, other); + } + template ::value, void>::type> + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_xor(batch(s), batch(o), sse4_2 {}); }, + self, other); + } + template ::value, void>::type> + inline batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_xor(batch_bool(s), batch_bool(o), sse4_2 {}); }, + self, other); + } + + // bitwise_cast + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castsi256_ps(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castsi256_pd(self); + } + template ::type>::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return batch(self.data); + } + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castps_pd(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castps_si256(self); + } + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castpd_ps(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castpd_si256(self); + } + + // bitwise_not + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); + } + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); + } + template + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); + } + template + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); + } + + // broadcast + template ::value, void>::type> + inline batch broadcast(T val, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_set1_epi8(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_set1_epi16(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_set1_epi32(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_set1_epi64x(val); + } + else + { + assert(false && "unsupported"); + return {}; + } + } + template + inline batch broadcast(float val, requires_arch) noexcept + { + return _mm256_set1_ps(val); + } + template + inline batch broadcast(double val, requires_arch) noexcept + { + return _mm256_set1_pd(val); + } + + // ceil + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return _mm256_ceil_ps(self); + } + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return _mm256_ceil_pd(self); + } + + namespace detail + { + // On clang, _mm256_extractf128_ps is built upon build_shufflevector + // which require index parameter to be a constant + template + inline B get_half_complex_f(const B& real, const B& imag) noexcept + { + __m128 tmp0 = _mm256_extractf128_ps(real, index); + __m128 tmp1 = _mm256_extractf128_ps(imag, index); + __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1); + tmp0 = _mm_unpacklo_ps(tmp0, tmp1); + __m256 res = real; + res = _mm256_insertf128_ps(res, tmp0, 0); + res = _mm256_insertf128_ps(res, tmp2, 1); + return res; + } + template + inline B get_half_complex_d(const B& real, const B& imag) noexcept + { + __m128d tmp0 = _mm256_extractf128_pd(real, index); + __m128d tmp1 = _mm256_extractf128_pd(imag, index); + __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1); + tmp0 = _mm_unpacklo_pd(tmp0, tmp1); + __m256d res = real; + res = _mm256_insertf128_pd(res, tmp0, 0); + res = _mm256_insertf128_pd(res, tmp2, 1); + return res; + } + + // complex_low + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return get_half_complex_f<0>(self.real(), self.imag()); + } + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return get_half_complex_d<0>(self.real(), self.imag()); + } + + // complex_high + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return get_half_complex_f<1>(self.real(), self.imag()); + } + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return get_half_complex_d<1>(self.real(), self.imag()); + } + } + + // fast_cast + namespace detail + { + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_cvtepi32_ps(self); + } + + template + inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept + { + // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse + // adapted to avx + __m256i msk_lo = _mm256_set1_epi32(0xFFFF); + __m256 cnst65536f = _mm256_set1_ps(65536.0f); + + __m256i v_lo = bitwise_and(batch(v), batch(msk_lo)); /* extract the 16 lowest significant bits of self */ + __m256i v_hi = bitwise_rshift(batch(v), 16, avx {}); /* 16 most significant bits of v */ + __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */ + __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */ + v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */ + return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ + } + + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_cvttps_epi32(self); + } + + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castps_si256( + _mm256_blendv_ps(_mm256_castsi256_ps(_mm256_cvttps_epi32(self)), + _mm256_xor_ps( + _mm256_castsi256_ps(_mm256_cvttps_epi32(_mm256_sub_ps(self, _mm256_set1_ps(1u << 31)))), + _mm256_castsi256_ps(_mm256_set1_epi32(1u << 31))), + _mm256_cmp_ps(self, _mm256_set1_ps(1u << 31), _CMP_GE_OQ))); + } + } + + // div + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_div_ps(self, other); + } + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_div_pd(self, other); + } + + // eq + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_ps(self, other, _CMP_EQ_OQ); + } + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_pd(self, other, _CMP_EQ_OQ); + } + template + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return ~(self != other); + } + template + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return ~(self != other); + } + template ::value, void>::type> + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return eq(batch(s), batch(o), sse4_2 {}); }, + self, other); + } + + template ::value, void>::type> + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return ~(self != other); + } + + // floor + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return _mm256_floor_ps(self); + } + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return _mm256_floor_pd(self); + } + + // from_mask + template + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut32[] = { + 0x0000000000000000ul, + 0x00000000FFFFFFFFul, + 0xFFFFFFFF00000000ul, + 0xFFFFFFFFFFFFFFFFul, + }; + assert(!(mask & ~0xFFul) && "inbound mask"); + return _mm256_castsi256_ps(_mm256_setr_epi64x(lut32[mask & 0x3], lut32[(mask >> 2) & 0x3], lut32[(mask >> 4) & 0x3], lut32[mask >> 6])); + } + template + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut64[][4] = { + { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + }; + assert(!(mask & ~0xFul) && "inbound mask"); + return _mm256_castsi256_pd(_mm256_load_si256((const __m256i*)lut64[mask])); + } + template ::value, void>::type> + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint32_t lut32[] = { + 0x00000000, + 0x000000FF, + 0x0000FF00, + 0x0000FFFF, + 0x00FF0000, + 0x00FF00FF, + 0x00FFFF00, + 0x00FFFFFF, + 0xFF000000, + 0xFF0000FF, + 0xFF00FF00, + 0xFF00FFFF, + 0xFFFF0000, + 0xFFFF00FF, + 0xFFFFFF00, + 0xFFFFFFFF, + }; + alignas(A::alignment()) static const uint64_t lut64[] = { + 0x0000000000000000ul, + 0x000000000000FFFFul, + 0x00000000FFFF0000ul, + 0x00000000FFFFFFFFul, + 0x0000FFFF00000000ul, + 0x0000FFFF0000FFFFul, + 0x0000FFFFFFFF0000ul, + 0x0000FFFFFFFFFFFFul, + 0xFFFF000000000000ul, + 0xFFFF00000000FFFFul, + 0xFFFF0000FFFF0000ul, + 0xFFFF0000FFFFFFFFul, + 0xFFFFFFFF00000000ul, + 0xFFFFFFFF0000FFFFul, + 0xFFFFFFFFFFFF0000ul, + 0xFFFFFFFFFFFFFFFFul, + }; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + assert(!(mask & ~0xFFFFFFFFul) && "inbound mask"); + return _mm256_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], + lut32[(mask >> 8) & 0xF], lut32[(mask >> 12) & 0xF], + lut32[(mask >> 16) & 0xF], lut32[(mask >> 20) & 0xF], + lut32[(mask >> 24) & 0xF], lut32[mask >> 28]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + assert(!(mask & ~0xFFFFul) && "inbound mask"); + return _mm256_setr_epi64x(lut64[mask & 0xF], lut64[(mask >> 4) & 0xF], lut64[(mask >> 8) & 0xF], lut64[(mask >> 12) & 0xF]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_castps_si256(from_mask(batch_bool {}, mask, avx {})); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_castpd_si256(from_mask(batch_bool {}, mask, avx {})); + } + } + + // haddp + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + // row = (a,b,c,d,e,f,g,h) + // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7) + __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]); + // tmp1 = (c0+c1, c2+c3, d1+d2, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7) + __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]); + // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, + // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7) + tmp1 = _mm256_hadd_ps(tmp0, tmp1); + // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7) + tmp0 = _mm256_hadd_ps(row[4], row[5]); + // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7) + __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]); + // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3, + // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) + tmp2 = _mm256_hadd_ps(tmp0, tmp2); + // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, + // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) + tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000); + // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7, + // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3) + tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21); + return _mm256_add_ps(tmp0, tmp1); + } + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + // row = (a,b,c,d) + // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3) + __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]); + // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3) + __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]); + // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3) + __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100); + // tmp1 = (a2+a3, b2+b3, c2+c3, d2+d3) + tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21); + return _mm256_add_pd(tmp1, tmp2); + } + + // insert + template ::value, void>::type> + inline batch insert(batch const& self, T val, index pos, requires_arch) noexcept + { +#if !defined(_MSC_VER) || _MSC_VER > 1900 + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_insert_epi8(self, val, I); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_insert_epi16(self, val, I); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_insert_epi32(self, val, I); + } + else + { + return insert(self, val, pos, generic {}); + } +#endif + return insert(self, val, pos, generic {}); + } + + // isnan + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm256_cmp_ps(self, self, _CMP_UNORD_Q); + } + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm256_cmp_pd(self, self, _CMP_UNORD_Q); + } + + // le + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_ps(self, other, _CMP_LE_OQ); + } + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_pd(self, other, _CMP_LE_OQ); + } + + // load_aligned + template ::value, void>::type> + inline batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + return _mm256_load_si256((__m256i const*)mem); + } + template + inline batch load_aligned(float const* mem, convert, requires_arch) noexcept + { + return _mm256_load_ps(mem); + } + template + inline batch load_aligned(double const* mem, convert, requires_arch) noexcept + { + return _mm256_load_pd(mem); + } + + namespace detail + { + // load_complex + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + using batch_type = batch; + __m128 tmp0 = _mm256_extractf128_ps(hi, 0); + __m128 tmp1 = _mm256_extractf128_ps(hi, 1); + __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + batch_type real = _mm256_castps128_ps256(tmp_real); + batch_type imag = _mm256_castps128_ps256(tmp_imag); + + tmp0 = _mm256_extractf128_ps(lo, 0); + tmp1 = _mm256_extractf128_ps(lo, 1); + tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + real = _mm256_insertf128_ps(real, tmp_real, 1); + imag = _mm256_insertf128_ps(imag, tmp_imag, 1); + return { real, imag }; + } + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + using batch_type = batch; + __m128d tmp0 = _mm256_extractf128_pd(hi, 0); + __m128d tmp1 = _mm256_extractf128_pd(hi, 1); + batch_type real = _mm256_castpd128_pd256(_mm_unpacklo_pd(tmp0, tmp1)); + batch_type imag = _mm256_castpd128_pd256(_mm_unpackhi_pd(tmp0, tmp1)); + + tmp0 = _mm256_extractf128_pd(lo, 0); + tmp1 = _mm256_extractf128_pd(lo, 1); + __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1); + __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1); + real = _mm256_blend_pd(real, re_tmp1, 12); + imag = _mm256_blend_pd(imag, im_tmp1, 12); + return { real, imag }; + } + } + + // load_unaligned + template ::value, void>::type> + inline batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return _mm256_loadu_si256((__m256i const*)mem); + } + template + inline batch load_unaligned(float const* mem, convert, requires_arch) noexcept + { + return _mm256_loadu_ps(mem); + } + template + inline batch load_unaligned(double const* mem, convert, requires_arch) noexcept + { + return _mm256_loadu_pd(mem); + } + + // lt + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_ps(self, other, _CMP_LT_OQ); + } + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_pd(self, other, _CMP_LT_OQ); + } + + template ::value, void>::type> + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return lt(batch(s), batch(o)); }, + self, other); + } + + // mask + template ::value, void>::type> + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) + { + __m128i self_low, self_high; + detail::split_avx(self, self_low, self_high); + return mask(batch_bool(self_low), sse4_2 {}) | (mask(batch_bool(self_high), sse4_2 {}) << (128 / (8 * sizeof(T)))); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_movemask_ps(_mm256_castsi256_ps(self)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_movemask_pd(_mm256_castsi256_pd(self)); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return _mm256_movemask_ps(self); + } + + template + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return _mm256_movemask_pd(self); + } + + // max + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_max_ps(self, other); + } + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_max_pd(self, other); + } + template ::value, void>::type> + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self > other, self, other); + } + + // min + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_min_ps(self, other); + } + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_min_pd(self, other); + } + template ::value, void>::type> + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self <= other, self, other); + } + + // mul + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_mul_ps(self, other); + } + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_mul_pd(self, other); + } + + // nearbyint + template + inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT); + } + template + inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT); + } + + // nearbyint_as_int + template + inline batch nearbyint_as_int(batch const& self, + requires_arch) noexcept + { + return _mm256_cvtps_epi32(self); + } + + // neg + template ::value, void>::type> + inline batch neg(batch const& self, requires_arch) noexcept + { + return 0 - self; + } + template + batch neg(batch const& self, requires_arch) + { + return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); + } + template + inline batch neg(batch const& self, requires_arch) noexcept + { + return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000))); + } + + // neq + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_ps(self, other, _CMP_NEQ_UQ); + } + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_pd(self, other, _CMP_NEQ_UQ); + } + template ::value, void>::type> + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self == other); + } + + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_ps(self, other); + } + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_pd(self, other); + } + template ::value, void>::type> + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(self.data), _mm256_castsi256_ps(other.data))); + } + + // reciprocal + template + inline batch reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm256_rcp_ps(self); + } + + // reduce_add + template + inline float reduce_add(batch const& rhs, requires_arch) noexcept + { + // Warning about _mm256_hadd_ps: + // _mm256_hadd_ps(a,b) gives + // (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). Hence we can't + // rely on a naive use of this method + // rhs = (x0, x1, x2, x3, x4, x5, x6, x7) + // tmp = (x4, x5, x6, x7, x0, x1, x2, x3) + __m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1); + // tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7) + tmp = _mm256_add_ps(rhs, tmp); + // tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -) + tmp = _mm256_hadd_ps(tmp, tmp); + // tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -) + tmp = _mm256_hadd_ps(tmp, tmp); + return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0)); + } + template + inline double reduce_add(batch const& rhs, requires_arch) noexcept + { + // rhs = (x0, x1, x2, x3) + // tmp = (x2, x3, x0, x1) + __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1); + // tmp = (x2+x0, x3+x1, -, -) + tmp = _mm256_add_pd(rhs, tmp); + // tmp = (x2+x0+x3+x1, -, -, -) + tmp = _mm256_hadd_pd(tmp, tmp); + return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0)); + } + template ::value, void>::type> + inline T reduce_add(batch const& self, requires_arch) noexcept + { + __m128i low, high; + detail::split_avx(self, low, high); + batch blow(low), bhigh(high); + return reduce_add(blow) + reduce_add(bhigh); + } + + // reduce_max + template ::type> + inline T reduce_max(batch const& self, requires_arch) noexcept + { + constexpr auto mask = detail::shuffle(1, 0); + batch step = _mm256_permute2f128_si256(self, self, mask); + batch acc = max(self, step); + __m128i low = _mm256_castsi256_si128(acc); + return reduce_max(batch(low)); + } + + // reduce_min + template ::type> + inline T reduce_min(batch const& self, requires_arch) noexcept + { + constexpr auto mask = detail::shuffle(1, 0); + batch step = _mm256_permute2f128_si256(self, self, mask); + batch acc = min(self, step); + __m128i low = _mm256_castsi256_si128(acc); + return reduce_min(batch(low)); + } + + // rsqrt + template + inline batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm256_rsqrt_ps(val); + } + template + inline batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(val))); + } + + // sadd + template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = (other >> (8 * sizeof(T) - 1)); + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); + } + else + { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } + + // select + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm256_blendv_ps(false_br, true_br, cond); + } + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm256_blendv_pd(false_br, true_br, cond); + } + template ::value, void>::type> + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + __m128i cond_low, cond_hi; + detail::split_avx(cond, cond_low, cond_hi); + + __m128i true_low, true_hi; + detail::split_avx(true_br, true_low, true_hi); + + __m128i false_low, false_hi; + detail::split_avx(false_br, false_low, false_hi); + + __m128i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), sse4_2 {}); + __m128i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), sse4_2 {}); + return detail::merge_sse(res_low, res_hi); + } + template ::value, void>::type> + inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... }, true_br, false_br, avx2 {}); + } + + template + inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + constexpr auto mask = batch_bool_constant, Values...>::mask(); + return _mm256_blend_ps(false_br, true_br, mask); + } + + template + inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + constexpr auto mask = batch_bool_constant, Values...>::mask(); + return _mm256_blend_pd(false_br, true_br, mask); + } + + // set + template + inline batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm256_setr_ps(values...); + } + + template + inline batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm256_setr_pd(values...); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept + { + return _mm256_set_epi64x(v3, v2, v1, v0); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept + { + return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + { + return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept + { + return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); + } + + template ::value, void>::type> + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + } + + template + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm256_castsi256_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); + } + + template + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm256_castsi256_pd(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); + } + + // slide_left + template + inline batch slide_left(batch const& x, requires_arch) noexcept + { + constexpr unsigned BitCount = N * 8; + if (BitCount == 0) + { + return x; + } + if (BitCount >= 256) + { + return batch(T(0)); + } + if (BitCount > 128) + { + constexpr unsigned M = (BitCount - 128) / 8; + __m128i low = _mm256_castsi256_si128(x); + auto y = _mm_slli_si128(low, M); + __m256i zero = _mm256_setzero_si256(); + return _mm256_insertf128_si256(zero, y, 1); + } + if (BitCount == 128) + { + __m128i low = _mm256_castsi256_si128(x); + __m256i zero = _mm256_setzero_si256(); + return _mm256_insertf128_si256(zero, low, 1); + } + // shifting by [0, 128[ bits + constexpr unsigned M = BitCount / 8; + + __m128i low = _mm256_castsi256_si128(x); + auto ylow = _mm_slli_si128(low, M); + auto zlow = _mm_srli_si128(low, 16 - M); + + __m128i high = _mm256_extractf128_si256(x, 1); + auto yhigh = _mm_slli_si128(high, M); + + __m256i res = _mm256_castsi128_si256(ylow); + return _mm256_insertf128_si256(res, _mm_or_si128(yhigh, zlow), 1); + } + + // slide_right + template + inline batch slide_right(batch const& x, requires_arch) noexcept + { + constexpr unsigned BitCount = N * 8; + if (BitCount == 0) + { + return x; + } + if (BitCount >= 256) + { + return batch(T(0)); + } + if (BitCount > 128) + { + constexpr unsigned M = (BitCount - 128) / 8; + __m128i high = _mm256_extractf128_si256(x, 1); + __m128i y = _mm_srli_si128(high, M); + __m256i zero = _mm256_setzero_si256(); + return _mm256_insertf128_si256(zero, y, 0); + } + if (BitCount == 128) + { + __m128i high = _mm256_extractf128_si256(x, 1); + return _mm256_castsi128_si256(high); + } + // shifting by [0, 128[ bits + constexpr unsigned M = BitCount / 8; + + __m128i low = _mm256_castsi256_si128(x); + auto ylow = _mm_srli_si128(low, M); + + __m128i high = _mm256_extractf128_si256(x, 1); + auto yhigh = _mm_srli_si128(high, M); + auto zhigh = _mm_slli_si128(high, 16 - M); + + __m256i res = _mm256_castsi128_si256(_mm_or_si128(ylow, zhigh)); + return _mm256_insertf128_si256(res, yhigh, 1); + } + + // sqrt + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm256_sqrt_ps(val); + } + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm256_sqrt_pd(val); + } + + // ssub + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + return sadd(self, -other); + } + else + { + const auto diff = min(self, other); + return self - diff; + } + } + + // store_aligned + template ::value, void>::type> + inline void store_aligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm256_store_si256((__m256i*)mem, self); + } + template ::value, void>::type> + inline void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm256_store_si256((__m256i*)mem, self); + } + template + inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm256_store_ps(mem, self); + } + template + inline void store_aligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm256_store_pd(mem, self); + } + + // store_unaligned + template ::value, void>::type> + inline void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm256_storeu_si256((__m256i*)mem, self); + } + template ::value, void>::type> + inline void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm256_storeu_si256((__m256i*)mem, self); + } + template + inline void store_unaligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm256_storeu_ps(mem, self); + } + template + inline void store_unaligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm256_storeu_pd(mem, self); + } + + // sub + template ::value, void>::type> + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return sub(batch(s), batch(o)); }, + self, other); + } + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_sub_ps(self, other); + } + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_sub_pd(self, other); + } + + // swizzle + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch) noexcept + { + // duplicate low and high part of input + __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1)); + __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0); + + __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self)); + __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1); + + // normalize mask + batch_constant, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask; + + // permute within each lane + __m256 r0 = _mm256_permutevar_ps(low_low, (batch)half_mask); + __m256 r1 = _mm256_permutevar_ps(hi_hi, (batch)half_mask); + + // mask to choose the right lane + batch_bool_constant, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask; + + // blend the two permutes + constexpr auto mask = blend_mask.mask(); + return _mm256_blend_ps(r0, r1, mask); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept + { + // duplicate low and high part of input + __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1)); + __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0); + + __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self)); + __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1); + + // normalize mask + batch_constant, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask; + + // permute within each lane + __m256d r0 = _mm256_permutevar_pd(low_low, (batch)half_mask); + __m256d r1 = _mm256_permutevar_pd(hi_hi, (batch)half_mask); + + // mask to choose the right lane + batch_bool_constant, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask; + + // blend the two permutes + constexpr auto mask = blend_mask.mask(); + return _mm256_blend_pd(r0, r1, mask); + } + template = 0> + inline batch swizzle(batch const& self, + batch_constant, + V0, + V1, + V2, + V3, + V4, + V5, + V6, + V7> const& mask, + requires_arch) noexcept + { + return bitwise_cast( + swizzle(bitwise_cast(self), mask)); + } + + template = 0> + inline batch + swizzle(batch const& self, + batch_constant, V0, V1, V2, V3> const& mask, + requires_arch) noexcept + { + return bitwise_cast( + swizzle(bitwise_cast(self), mask)); + } + + // trunc + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return _mm256_round_ps(self, _MM_FROUND_TO_ZERO); + } + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return _mm256_round_pd(self, _MM_FROUND_TO_ZERO); + } + + // zip_hi + template ::value, void>::type> + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) + { + // extract high word + __m128i self_hi = _mm256_extractf128_si256(self, 1); + __m128i other_hi = _mm256_extractf128_si256(other, 1); + + // interleave + __m128i res_lo, res_hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + res_lo = _mm_unpacklo_epi8(self_hi, other_hi); + res_hi = _mm_unpackhi_epi8(self_hi, other_hi); + } + else + { + res_lo = _mm_unpacklo_epi16(self_hi, other_hi); + res_hi = _mm_unpackhi_epi16(self_hi, other_hi); + } + + // fuse + return _mm256_castps_si256( + _mm256_insertf128_ps( + _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)), + _mm_castsi128_ps(res_hi), + 1)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); + auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); + return _mm256_castps_si256(_mm256_permute2f128_ps(lo, hi, 0x31)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); + auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); + return _mm256_castpd_si256(_mm256_permute2f128_pd(lo, hi, 0x31)); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm256_unpacklo_ps(self, other); + auto hi = _mm256_unpackhi_ps(self, other); + return _mm256_permute2f128_ps(lo, hi, 0x31); + } + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm256_unpacklo_pd(self, other); + auto hi = _mm256_unpackhi_pd(self, other); + return _mm256_permute2f128_pd(lo, hi, 0x31); + } + + // zip_lo + template ::value, void>::type> + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) + { + // extract low word + __m128i self_lo = _mm256_extractf128_si256(self, 0); + __m128i other_lo = _mm256_extractf128_si256(other, 0); + + // interleave + __m128i res_lo, res_hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + res_lo = _mm_unpacklo_epi8(self_lo, other_lo); + res_hi = _mm_unpackhi_epi8(self_lo, other_lo); + } + else + { + res_lo = _mm_unpacklo_epi16(self_lo, other_lo); + res_hi = _mm_unpackhi_epi16(self_lo, other_lo); + } + + // fuse + return _mm256_castps_si256( + _mm256_insertf128_ps( + _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)), + _mm_castsi128_ps(res_hi), + 1)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); + auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); + return _mm256_castps_si256(_mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); + auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); + return _mm256_castpd_si256(_mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1)); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm256_unpacklo_ps(self, other); + auto hi = _mm256_unpackhi_ps(self, other); + return _mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1); + } + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm256_unpacklo_pd(self, other); + auto hi = _mm256_unpackhi_pd(self, other); + return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1); + } + } +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp new file mode 100644 index 0000000000..8d0fcc27a4 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp @@ -0,0 +1,950 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX2_HPP +#define XSIMD_AVX2_HPP + +#include +#include + +#include "../types/xsimd_avx2_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // abs + template ::value, void>::type> + inline batch abs(batch const& self, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_abs_epi8(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_abs_epi16(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_abs_epi32(self); + } + else + { + return abs(self, avx {}); + } + } + return self; + } + + // add + template ::value, void>::type> + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_add_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_add_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_add_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_add_epi64(self, other); + } + else + { + return add(self, other, avx {}); + } + } + + // bitwise_and + template ::value, void>::type> + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_and_si256(self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_and_si256(self, other); + } + + // bitwise_andnot + template ::value, void>::type> + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_andnot_si256(other, self); + } + template ::value, void>::type> + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_andnot_si256(other, self); + } + + // bitwise_not + template ::value, void>::type> + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); + } + template ::value, void>::type> + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); + } + + // bitwise_lshift + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_slli_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_slli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_slli_epi64(self, other); + } + else + { + return bitwise_lshift(self, other, avx {}); + } + } + + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_sllv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_sllv_epi64(self, other); + } + else + { + return bitwise_lshift(self, other, avx {}); + } + } + + // bitwise_or + template ::value, void>::type> + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_or_si256(self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_or_si256(self, other); + } + + // bitwise_rshift + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF); + __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self); + __m256i res = _mm256_srai_epi16(self, other); + return _mm256_or_si256( + detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_and(batch(s), batch(o), sse4_2 {}); }, + sign_mask, cmp_is_negative), + _mm256_andnot_si256(sign_mask, res)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_srai_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_srai_epi32(self, other); + } + else + { + return bitwise_rshift(self, other, avx {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_srli_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_srli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_srli_epi64(self, other); + } + else + { + return bitwise_rshift(self, other, avx {}); + } + } + } + + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_srav_epi32(self, other); + } + else + { + return bitwise_rshift(self, other, avx {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_srlv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_srlv_epi64(self, other); + } + else + { + return bitwise_rshift(self, other, avx {}); + } + } + } + + // bitwise_xor + template ::value, void>::type> + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_xor_si256(self, other); + } + template ::value, void>::type> + inline batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_si256(self, other); + } + + // complex_low + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0)); + __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0)); + return _mm256_blend_pd(tmp0, tmp1, 10); + } + + // complex_high + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2)); + __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0)); + return _mm256_blend_pd(tmp0, tmp1, 10); + } + + // fast_cast + namespace detail + { + + template + inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept + { + // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse + __m256i msk_lo = _mm256_set1_epi32(0xFFFF); + __m256 cnst65536f = _mm256_set1_ps(65536.0f); + + __m256i v_lo = _mm256_and_si256(v, msk_lo); /* extract the 16 lowest significant bits of self */ + __m256i v_hi = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */ + __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */ + __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */ + v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */ + return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ + } + + template + inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to avx + __m256i xH = _mm256_srli_epi64(x, 32); + xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84 + __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, + 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); + __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52 + __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 + return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); + } + + template + inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to avx + __m256i xH = _mm256_srai_epi32(x, 16); + xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); + xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67 + __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, + 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); + __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52 + __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 + return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); + } + } + + // eq + template ::value, void>::type> + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_cmpeq_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_cmpeq_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_cmpeq_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_cmpeq_epi64(self, other); + } + else + { + return eq(self, other, avx {}); + } + } + + // gather + template = 0, detail::enable_sized_integral_t = 0> + inline batch gather(batch const&, T const* src, batch const& index, + kernel::requires_arch) noexcept + { + // scatter for this one is AVX512F+AVX512VL + return _mm256_i32gather_epi32(reinterpret_cast(src), index, sizeof(T)); + } + + template = 0, detail::enable_sized_integral_t = 0> + inline batch gather(batch const&, T const* src, batch const& index, + kernel::requires_arch) noexcept + { + // scatter for this one is AVX512F+AVX512VL + return _mm256_i64gather_epi64(reinterpret_cast(src), index, sizeof(T)); + } + + template = 0> + inline batch gather(batch const&, float const* src, + batch const& index, + kernel::requires_arch) noexcept + { + // scatter for this one is AVX512F+AVX512VL + return _mm256_i32gather_ps(src, index, sizeof(float)); + } + + template = 0> + inline batch gather(batch const&, double const* src, + batch const& index, + requires_arch) noexcept + { + // scatter for this one is AVX512F+AVX512VL + return _mm256_i64gather_pd(src, index, sizeof(double)); + } + + // gather: handmade conversions + template = 0> + inline batch gather(batch const&, double const* src, + batch const& index, + requires_arch) noexcept + { + const batch low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double))); + const batch high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double))); + return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data)); + } + + template = 0> + inline batch gather(batch const&, double const* src, + batch const& index, + requires_arch) noexcept + { + const batch low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double))); + const batch high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double))); + return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data)); + } + + // lt + template ::value, void>::type> + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_cmpgt_epi8(other, self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_cmpgt_epi16(other, self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_cmpgt_epi32(other, self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_cmpgt_epi64(other, self); + } + else + { + return lt(self, other, avx {}); + } + } + else + { + return lt(self, other, avx {}); + } + } + + // load_complex + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + using batch_type = batch; + batch_type real = _mm256_castpd_ps( + _mm256_permute4x64_pd( + _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))), + _MM_SHUFFLE(3, 1, 2, 0))); + batch_type imag = _mm256_castpd_ps( + _mm256_permute4x64_pd( + _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))), + _MM_SHUFFLE(3, 1, 2, 0))); + return { real, imag }; + } + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + using batch_type = batch; + batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); + batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); + return { real, imag }; + } + // mask + template ::value, void>::type> + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self); + return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12); + } + else + { + return mask(self, avx {}); + } + } + + // max + template ::value, void>::type> + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_max_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_max_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_max_epi32(self, other); + } + else + { + return max(self, other, avx {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_max_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_max_epu16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_max_epu32(self, other); + } + else + { + return max(self, other, avx {}); + } + } + } + + // min + template ::value, void>::type> + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_min_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_min_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_min_epi32(self, other); + } + else + { + return min(self, other, avx {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_min_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_min_epu16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_min_epu32(self, other); + } + else + { + return min(self, other, avx {}); + } + } + } + + // mul + template ::value, void>::type> + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + __m256i mask_hi = _mm256_set1_epi32(0xFF00FF00); + __m256i res_lo = _mm256_mullo_epi16(self, other); + __m256i other_hi = _mm256_srli_epi16(other, 8); + __m256i self_hi = _mm256_and_si256(self, mask_hi); + __m256i res_hi = _mm256_mullo_epi16(self_hi, other_hi); + __m256i res = _mm256_blendv_epi8(res_lo, res_hi, mask_hi); + return res; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_mullo_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_mullo_epi32(self, other); + } + else + { + return mul(self, other, avx {}); + } + } + + // reduce_add + template ::value, void>::type> + inline T reduce_add(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + __m256i tmp1 = _mm256_hadd_epi32(self, self); + __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1); + __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); + __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3); + return _mm_cvtsi128_si32(tmp4); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E); + __m256i tmp2 = _mm256_add_epi64(self, tmp1); + __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); + __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3); +#if defined(__x86_64__) + return _mm_cvtsi128_si64(res); +#else + __m128i m; + _mm_storel_epi64(&m, res); + int64_t i; + std::memcpy(&i, &m, sizeof(i)); + return i; +#endif + } + else + { + return reduce_add(self, avx {}); + } + } + + // sadd + template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_adds_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_adds_epi16(self, other); + } + else + { + return sadd(self, other, avx {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_adds_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_adds_epu16(self, other); + } + else + { + return sadd(self, other, avx {}); + } + } + } + + // select + template ::value, void>::type> + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_blendv_epi8(false_br, true_br, cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_blendv_epi8(false_br, true_br, cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_blendv_epi8(false_br, true_br, cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_blendv_epi8(false_br, true_br, cond); + } + else + { + return select(cond, true_br, false_br, avx {}); + } + } + template ::value, void>::type> + inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + constexpr int mask = batch_bool_constant, Values...>::mask(); + // FIXME: for some reason mask here is not considered as an immediate, + // but it's okay for _mm256_blend_epi32 + // case 2: return _mm256_blend_epi16(false_br, true_br, mask); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_blend_epi32(false_br, true_br, mask); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + constexpr int imask = detail::interleave(mask); + return _mm256_blend_epi32(false_br, true_br, imask); + } + else + { + return select(batch_bool { Values... }, true_br, false_br, avx2 {}); + } + } + + // slide_left + template + inline batch slide_left(batch const& x, requires_arch) noexcept + { + constexpr unsigned BitCount = N * 8; + if (BitCount == 0) + { + return x; + } + if (BitCount >= 256) + { + return batch(T(0)); + } + if (BitCount > 128) + { + constexpr unsigned M = (BitCount - 128) / 8; + auto y = _mm256_bslli_epi128(x, M); + return _mm256_permute2x128_si256(y, y, 0x28); + } + if (BitCount == 128) + { + return _mm256_permute2x128_si256(x, x, 0x28); + } + // shifting by [0, 128[ bits + constexpr unsigned M = BitCount / 8; + auto y = _mm256_bslli_epi128(x, M); + auto z = _mm256_bsrli_epi128(x, 16 - M); + auto w = _mm256_permute2x128_si256(z, z, 0x28); + return _mm256_or_si256(y, w); + } + + // slide_right + template + inline batch slide_right(batch const& x, requires_arch) noexcept + { + constexpr unsigned BitCount = N * 8; + if (BitCount == 0) + { + return x; + } + if (BitCount >= 256) + { + return batch(T(0)); + } + if (BitCount > 128) + { + constexpr unsigned M = (BitCount - 128) / 8; + auto y = _mm256_bsrli_epi128(x, M); + return _mm256_permute2x128_si256(y, y, 0x81); + } + if (BitCount == 128) + { + return _mm256_permute2x128_si256(x, x, 0x81); + } + // shifting by [0, 128[ bits + constexpr unsigned M = BitCount / 8; + auto y = _mm256_bsrli_epi128(x, M); + auto z = _mm256_bslli_epi128(x, 16 - M); + auto w = _mm256_permute2x128_si256(z, z, 0x81); + return _mm256_or_si256(y, w); + } + + // ssub + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_subs_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_subs_epi16(self, other); + } + else + { + return ssub(self, other, avx {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_subs_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_subs_epu16(self, other); + } + else + { + return ssub(self, other, avx {}); + } + } + } + + // sub + template ::value, void>::type> + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_sub_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_sub_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_sub_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_sub_epi64(self, other); + } + else + { + return sub(self, other, avx {}); + } + } + + // swizzle + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch) noexcept + { + return _mm256_permutevar8x32_ps(self, (batch)mask); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept + { + constexpr auto mask = detail::shuffle(V0, V1, V2, V3); + return _mm256_permute4x64_pd(self, mask); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept + { + constexpr auto mask = detail::shuffle(V0, V1, V2, V3); + return _mm256_permute4x64_epi64(self, mask); + } + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); + } + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch) noexcept + { + return _mm256_permutevar8x32_epi32(self, (batch)mask); + } + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); + } + + // zip_hi + template ::value, void>::type> + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto lo = _mm256_unpacklo_epi8(self, other); + auto hi = _mm256_unpackhi_epi8(self, other); + return _mm256_permute2f128_si256(lo, hi, 0x31); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto lo = _mm256_unpacklo_epi16(self, other); + auto hi = _mm256_unpackhi_epi16(self, other); + return _mm256_permute2f128_si256(lo, hi, 0x31); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + auto lo = _mm256_unpacklo_epi32(self, other); + auto hi = _mm256_unpackhi_epi32(self, other); + return _mm256_permute2f128_si256(lo, hi, 0x31); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + auto lo = _mm256_unpacklo_epi64(self, other); + auto hi = _mm256_unpackhi_epi64(self, other); + return _mm256_permute2f128_si256(lo, hi, 0x31); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // zip_lo + template ::value, void>::type> + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto lo = _mm256_unpacklo_epi8(self, other); + auto hi = _mm256_unpackhi_epi8(self, other); + return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto lo = _mm256_unpacklo_epi16(self, other); + auto hi = _mm256_unpackhi_epi16(self, other); + return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + auto lo = _mm256_unpacklo_epi32(self, other); + auto hi = _mm256_unpackhi_epi32(self, other); + return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + auto lo = _mm256_unpacklo_epi64(self, other); + auto hi = _mm256_unpackhi_epi64(self, other); + return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp new file mode 100644 index 0000000000..77182e1ef2 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp @@ -0,0 +1,627 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512BW_HPP +#define XSIMD_AVX512BW_HPP + +#include +#include + +#include "../types/xsimd_avx512bw_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + namespace detail + { + template + inline batch_bool compare_int_avx512bw(batch const& self, batch const& other) noexcept + { + using register_type = typename batch_bool::register_type; + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); + } + } + } + } + + // abs + template ::value, void>::type> + inline batch abs(batch const& self, requires_arch) noexcept + { + if (std::is_unsigned::value) + { + return self; + } + + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_abs_epi8(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_abs_epi16(self); + } + else + { + return abs(self, avx512dq {}); + } + } + + // add + template ::value, void>::type> + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_add_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_add_epi16(self, other); + } + else + { + return add(self, other, avx512dq {}); + } + } + + // bitwise_lshift + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_sllv_epi16(self, _mm512_set1_epi16(other)); +#else + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_slli_epi16(self, other); +#endif + } + else + { + return bitwise_lshift(self, other, avx512dq {}); + } + } + + // bitwise_rshift + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + __m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF); + __m512i zeros = _mm512_setzero_si512(); + __mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self); + __m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask); +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + __m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other)); +#else + __m512i res = _mm512_srai_epi16(self, other); +#endif + return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res)); +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_srav_epi16(self, _mm512_set1_epi16(other)); +#else + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_srai_epi16(self, other); +#endif + } + else + { + return bitwise_rshift(self, other, avx512dq {}); + } + } + else + { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_srlv_epi16(self, _mm512_set1_epi16(other)); +#else + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_srli_epi16(self, other); +#endif + } + else + { + return bitwise_rshift(self, other, avx512dq {}); + } + } + } + + // eq + template ::value, void>::type> + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } + + // ge + template ::value, void>::type> + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } + + // gt + template ::value, void>::type> + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } + + // le + template ::value, void>::type> + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } + + // lt + template ::value, void>::type> + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } + + // max + template ::value, void>::type> + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_max_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_max_epi16(self, other); + } + else + { + return max(self, other, avx512dq {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_max_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_max_epu16(self, other); + } + else + { + return max(self, other, avx512dq {}); + } + } + } + + // min + template ::value, void>::type> + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_min_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_min_epi16(self, other); + } + else + { + return min(self, other, avx512dq {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_min_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_min_epu16(self, other); + } + else + { + return min(self, other, avx512dq {}); + } + } + } + + // mul + template ::value, void>::type> + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + __m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8)); + __m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8); + return _mm512_or_si512(upper, lower); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_mullo_epi16(self, other); + } + else + { + return mul(self, other, avx512dq {}); + } + } + + // neq + template ::value, void>::type> + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } + + // sadd + template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_adds_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_adds_epi16(self, other); + } + else + { + return sadd(self, other, avx512dq {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_adds_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_adds_epu16(self, other); + } + else + { + return sadd(self, other, avx512dq {}); + } + } + } + + // select + template ::value, void>::type> + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data); + } + else + { + return select(cond, true_br, false_br, avx512dq {}); + } + } + + // slide_left + namespace detail + { + template + constexpr std::array make_slide_perm_hi(::xsimd::detail::index_sequence) + { + return { (Is == 0 ? 8 : Is - 1)... }; + } + + template + constexpr std::array make_slide_left_pattern(::xsimd::detail::index_sequence) + { + return { (Is >= N ? Is - N : 0)... }; + } + template + constexpr std::array make_slide_left_mask(::xsimd::detail::index_sequence) + { + return { (Is >= N ? 0xFFFF : 0x0000)... }; + } + } + + template + inline batch slide_left(batch const& x, requires_arch) noexcept + { + constexpr unsigned BitCount = N * 8; + if (BitCount == 0) + { + return x; + } + if (BitCount >= 512) + { + return batch(T(0)); + } + batch xx; + if (N & 1) + { + alignas(A::alignment()) uint64_t buffer[8]; + _mm512_store_epi64(&buffer[0], x); + for (int i = 7; i > 0; --i) + buffer[i] = (buffer[i] << 8) | (buffer[i - 1] >> 56); + buffer[0] = buffer[0] << 8; + xx = _mm512_load_epi64(&buffer[0]); + + alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>()); + __m512i xl = _mm512_slli_epi64(x, 8); + __m512i xr = _mm512_srli_epi64(x, 56); + xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512()); + xx = _mm512_or_si512(xr, xl); + if (N == 1) + return xx; + } + else + { + xx = x; + } + alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern(::xsimd::detail::make_index_sequence<512 / 16>()); + alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask(::xsimd::detail::make_index_sequence<512 / 16>()); + return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data())); + } + + // slide_right + namespace detail + { + template + constexpr std::array make_slide_perm_low(::xsimd::detail::index_sequence) + { + return { (Is + 1)... }; + } + + template + constexpr std::array make_slide_right_pattern(::xsimd::detail::index_sequence) + { + return { (Is < (32 - N) ? Is + N : 0)... }; + } + template + constexpr std::array make_slide_right_mask(::xsimd::detail::index_sequence) + { + return { (Is < 32 - N ? 0xFFFF : 0x0000)... }; + } + } + template + inline batch slide_right(batch const& x, requires_arch) noexcept + { + constexpr unsigned BitCount = N * 8; + if (BitCount == 0) + { + return x; + } + if (BitCount >= 512) + { + return batch(T(0)); + } + batch xx; + if (N & 1) + { + alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>()); + __m512i xr = _mm512_srli_epi64(x, 8); + __m512i xl = _mm512_slli_epi64(x, 56); + xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512()); + xx = _mm512_or_si512(xr, xl); + if (N == 1) + return xx; + } + else + { + xx = x; + } + alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern(::xsimd::detail::make_index_sequence<512 / 16>()); + alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask(::xsimd::detail::make_index_sequence<512 / 16>()); + return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data())); + } + + // ssub + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_subs_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_subs_epi16(self, other); + } + else + { + return ssub(self, other, avx512dq {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_subs_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_subs_epu16(self, other); + } + else + { + return ssub(self, other, avx512dq {}); + } + } + } + + // sub + template ::value, void>::type> + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_sub_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_sub_epi16(self, other); + } + else + { + return sub(self, other, avx512dq {}); + } + } + + // swizzle + + template + inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept + { + return _mm512_permutexvar_epi16((batch)mask, self); + } + + template + inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512bw {})); + } + + template + inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept + { + return _mm512_shuffle_epi8(self, (batch)mask); + } + + template + inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512bw {})); + } + + // zip_hi + template ::value, void>::type> + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + __m512i lo, hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + lo = _mm512_unpacklo_epi8(self, other); + hi = _mm512_unpackhi_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + lo = _mm512_unpacklo_epi16(self, other); + hi = _mm512_unpackhi_epi16(self, other); + } + else + { + return zip_hi(self, other, avx512f {}); + } + return _mm512_inserti32x4( + _mm512_inserti32x4( + _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0), + _mm512_extracti32x4_epi32(lo, 3), + 2), + _mm512_extracti32x4_epi32(hi, 2), + 1); + } + + // zip_lo + template ::value, void>::type> + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + __m512i lo, hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + lo = _mm512_unpacklo_epi8(self, other); + hi = _mm512_unpackhi_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + lo = _mm512_unpacklo_epi16(self, other); + hi = _mm512_unpackhi_epi16(self, other); + } + else + { + return zip_lo(self, other, avx512f {}); + } + return _mm512_inserti32x4( + _mm512_inserti32x4( + _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1), + _mm512_extracti32x4_epi32(hi, 1), + 3), + _mm512_extracti32x4_epi32(lo, 1), + 2); + } + } +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp new file mode 100644 index 0000000000..95f3f1df8f --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp @@ -0,0 +1,28 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512CD_HPP +#define XSIMD_AVX512CD_HPP + +#include "../types/xsimd_avx512cd_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + // Nothing there yet. + + } + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp new file mode 100644 index 0000000000..7840ea8fc5 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp @@ -0,0 +1,212 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512_DQHPP +#define XSIMD_AVX512_D_HPP + +#include "../types/xsimd_avx512dq_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // bitwise_and + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_and_ps(self, other); + } + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_and_pd(self, other); + } + + // bitwise_andnot + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_andnot_ps(other, self); + } + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_andnot_pd(other, self); + } + + // bitwise_not + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1))); + } + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1))); + } + + // bitwise_or + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_or_ps(self, other); + } + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_or_pd(self, other); + } + + template + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } + + // bitwise_xor + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_xor_ps(self, other); + } + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_xor_pd(self, other); + } + + // haddp + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + // The following folds over the vector once: + // tmp1 = [a0..8, b0..8] + // tmp2 = [a8..f, b8..f] +#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ + batch res##I; \ + { \ + auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ + auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ + res##I = _mm512_add_ps(tmp1, tmp2); \ + } + + XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); + XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); + XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); + XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); + XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); + XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); + XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); + XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); + +#undef XSIMD_AVX512_HADDP_STEP1 + + // The following flds the code and shuffles so that hadd_ps produces the correct result + // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) + // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) + // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... +#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ + batch halfx##I; \ + { \ + auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx1 = _mm512_add_ps(tmp1, tmp2); \ + \ + auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx2 = _mm512_add_ps(tmp3, tmp4); \ + \ + auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx3 = _mm512_add_ps(tmp5, tmp6); \ + \ + halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \ + _mm512_extractf32x8_ps(resx3, 1)); \ + } + + XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); + XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); + +#undef XSIMD_AVX512_HADDP_STEP2 + + auto concat = _mm512_castps256_ps512(halfx0); + concat = _mm512_insertf32x8(concat, halfx1, 1); + return concat; + } + + // ldexp + template + inline batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept + { + return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other)); + } + + // mul + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mullo_epi64(self, other); + } + + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mullo_epi64(self, other); + } + + // nearbyint_as_int + template + inline batch nearbyint_as_int(batch const& self, + requires_arch) noexcept + { + return _mm512_cvtpd_epi64(self); + } + + // reduce_add + template + inline float reduce_add(batch const& rhs, requires_arch) noexcept + { + __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1); + __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0); + __m256 res1 = _mm256_add_ps(tmp1, tmp2); + return reduce_add(batch(res1), avx2 {}); + } + + // convert + namespace detail + { + template + inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + return _mm512_cvtepi64_pd(self); + } + + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_cvttpd_epi64(self); + } + + } + + } + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp new file mode 100644 index 0000000000..7eea894137 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp @@ -0,0 +1,2028 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512F_HPP +#define XSIMD_AVX512F_HPP + +#include +#include +#include + +#include "../types/xsimd_avx512f_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + namespace detail + { + inline void split_avx512(__m512 val, __m256& low, __m256& high) noexcept + { + low = _mm512_castps512_ps256(val); + high = _mm512_extractf32x8_ps(val, 1); + } + inline void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept + { + low = _mm512_castpd512_pd256(val); + high = _mm512_extractf64x4_pd(val, 1); + } + inline void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept + { + low = _mm512_castsi512_si256(val); + high = _mm512_extracti64x4_epi64(val, 1); + } + inline __m512i merge_avx(__m256i low, __m256i high) noexcept + { + return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1); + } + inline __m512 merge_avx(__m256 low, __m256 high) noexcept + { + return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castpd256_pd512(_mm256_castps_pd(low)), _mm256_castps_pd(high), 1)); + } + inline __m512d merge_avx(__m256d low, __m256d high) noexcept + { + return _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1); + } + template + __m512i fwd_to_avx(F f, __m512i self) + { + __m256i self_low, self_high; + split_avx512(self, self_low, self_high); + __m256i res_low = f(self_low); + __m256i res_high = f(self_high); + return merge_avx(res_low, res_high); + } + template + __m512i fwd_to_avx(F f, __m512i self, __m512i other) + { + __m256i self_low, self_high, other_low, other_high; + split_avx512(self, self_low, self_high); + split_avx512(other, other_low, other_high); + __m256i res_low = f(self_low, other_low); + __m256i res_high = f(self_high, other_high); + return merge_avx(res_low, res_high); + } + template + __m512i fwd_to_avx(F f, __m512i self, int32_t other) + { + __m256i self_low, self_high; + split_avx512(self, self_low, self_high); + __m256i res_low = f(self_low, other); + __m256i res_high = f(self_high, other); + return merge_avx(res_low, res_high); + } + } + namespace detail + { + + inline uint32_t morton(uint16_t x, uint16_t y) noexcept + { + + static const unsigned short MortonTable256[256] = { + 0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015, + 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055, + 0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115, + 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155, + 0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415, + 0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455, + 0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515, + 0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555, + 0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015, + 0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055, + 0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115, + 0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155, + 0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415, + 0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455, + 0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515, + 0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555, + 0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015, + 0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055, + 0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115, + 0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155, + 0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415, + 0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455, + 0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515, + 0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555, + 0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015, + 0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055, + 0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115, + 0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155, + 0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415, + 0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455, + 0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515, + 0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555 + }; + + uint32_t z = MortonTable256[y >> 8] << 17 | MortonTable256[x >> 8] << 16 | MortonTable256[y & 0xFF] << 1 | MortonTable256[x & 0xFF]; + return z; + } + + template + inline batch_bool compare_int_avx512f(batch const& self, batch const& other) noexcept + { + using register_type = typename batch_bool::register_type; + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + // shifting to take sign into account + uint64_t mask_low0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x000000FF)) << 24, + (batch(other.data) & batch(0x000000FF)) << 24, + Cmp); + uint64_t mask_low1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FF00)) << 16, + (batch(other.data) & batch(0x0000FF00)) << 16, + Cmp); + uint64_t mask_high0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x00FF0000)) << 8, + (batch(other.data) & batch(0x00FF0000)) << 8, + Cmp); + uint64_t mask_high1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFF000000)), + (batch(other.data) & batch(0xFF000000)), + Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 16; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + // shifting to take sign into account + uint16_t mask_low = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FFFF)) << 16, + (batch(other.data) & batch(0x0000FFFF)) << 16, + Cmp); + uint16_t mask_high = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFFFF0000)), + (batch(other.data) & batch(0xFFFF0000)), + Cmp); + return static_cast(morton(mask_low, mask_high)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + uint64_t mask_low0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x000000FF)), (batch(other.data) & batch(0x000000FF)), Cmp); + uint64_t mask_low1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FF00)), (batch(other.data) & batch(0x0000FF00)), Cmp); + uint64_t mask_high0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x00FF0000)), (batch(other.data) & batch(0x00FF0000)), Cmp); + uint64_t mask_high1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 16; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + uint16_t mask_low = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FFFF)), (batch(other.data) & batch(0x0000FFFF)), Cmp); + uint16_t mask_high = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); + return static_cast(morton(mask_low, mask_high)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); + } + } + } + } + + // abs + template + inline batch abs(batch const& self, requires_arch) noexcept + { + __m512 self_asf = (__m512)self; + __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asf); + __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), self_asi); + return *reinterpret_cast<__m512*>(&res_asi); + } + template + inline batch abs(batch const& self, requires_arch) noexcept + { + __m512d self_asd = (__m512d)self; + __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asd); + __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), + self_asi); + return *reinterpret_cast<__m512d*>(&res_asi); + } + template ::value, void>::type> + inline batch abs(batch const& self, requires_arch) noexcept + { + if (std::is_unsigned::value) + { + return self; + } + + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return detail::fwd_to_avx([](__m256i s) noexcept + { return abs(batch(s)); }, + self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return detail::fwd_to_avx([](__m256i s) noexcept + { return abs(batch(s)); }, + self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_abs_epi32(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_abs_epi64(self); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // add + template ::value, void>::type> + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return add(batch(s), batch(o)); }, + self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return add(batch(s), batch(o)); }, + self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_add_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_add_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_add_ps(self, other); + } + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_add_pd(self, other); + } + + // all + template + inline bool all(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return self.data == register_type(-1); + } + + // any + template + inline bool any(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return self.data != register_type(0); + } + + // batch_bool_cast + template + inline batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return self.data; + } + + // bitwise_and + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { +#if defined(_MSC_VER) + return _mm512_and_ps(self, other); +#else + return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); +#endif + } + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } + + template ::value, void>::type> + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_and_si512(self, other); + } + + template + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & other.data); + } + + // bitwise_andnot + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(other), _mm512_castps_si512(self))); + } + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(other), _mm512_castpd_si512(self))); + } + + template ::value, void>::type> + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_andnot_si512(other, self); + } + + template + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & ~other.data); + } + + // bitwise_lshift + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + __m512i tmp = _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); +#else + __m512i tmp = _mm512_slli_epi32(self, other); +#endif + return _mm512_and_si512(_mm512_set1_epi8(0xFF << other), tmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept + { return bitwise_lshift(batch(s), o, avx2 {}); }, + self, other); +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_sllv_epi64(self, _mm512_set1_epi64(other)); +#else + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_slli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_slli_epi64(self, other); +#endif + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // bitwise_not + template ::value, void>::type> + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm512_xor_si512(self, _mm512_set1_epi32(-1)); + } + template + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data); + } + + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_set1_epi32(-1))); + } + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_set1_epi32(-1))); + } + + // bitwise_or + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); + } + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } + + template + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } + + template ::value, void>::type> + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_or_si512(self, other); + } + + // bitwise_rshift + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_srav_epi32(self, _mm512_set1_epi32(other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_srav_epi64(self, _mm512_set1_epi64(other)); +#else + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_srai_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_srai_epi64(self, other); +#endif + } + else + { + return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept + { return bitwise_rshift(batch(s), o, avx2 {}); }, + self, other); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + __m512i tmp = _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); +#else + __m512i tmp = _mm512_srli_epi32(self, other); +#endif + return _mm512_and_si512(_mm512_set1_epi8(0xFF >> other), tmp); +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_srlv_epi64(self, _mm512_set1_epi64(other)); +#else + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_srli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_srli_epi64(self, other); +#endif + } + else + { + return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept + { return bitwise_rshift(batch(s), o, avx2 {}); }, + self, other); + } + } + } + + // bitwise_xor + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); + } + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } + + template + inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } + + template ::value, void>::type> + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_xor_si512(self, other); + } + + // bitwise_cast + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castsi512_ps(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castsi512_pd(self); + } + template ::type>::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return batch(self.data); + } + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castps_pd(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castps_si512(self); + } + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castpd_ps(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castpd_si512(self); + } + + // broadcast + template ::value, void>::type> + inline batch broadcast(T val, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_set1_epi8(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_set1_epi16(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_set1_epi32(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_set1_epi64(val); + } + else + { + assert(false && "unsupported"); + return {}; + } + } + template + inline batch broadcast(float val, requires_arch) noexcept + { + return _mm512_set1_ps(val); + } + template + batch inline broadcast(double val, requires_arch) noexcept + { + return _mm512_set1_pd(val); + } + + // ceil + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF); + } + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF); + } + + // convert + namespace detail + { + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_cvtepi32_ps(self); + } + + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_cvttps_epi32(self); + } + + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_cvtepu32_ps(self); + } + + template + batch fast_cast(batch const& self, batch const&, requires_arch) + { + return _mm512_cvttps_epu32(self); + } + } + + namespace detail + { + // complex_low + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + return _mm512_permutex2var_ps(self.real(), idx, self.imag()); + } + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11); + return _mm512_permutex2var_pd(self.real(), idx, self.imag()); + } + + // complex_high + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + return _mm512_permutex2var_ps(self.real(), idx, self.imag()); + } + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15); + return _mm512_permutex2var_pd(self.real(), idx, self.imag()); + } + } + + // div + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_div_ps(self, other); + } + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_div_pd(self, other); + } + + // eq + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ); + } + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ); + } + + template ::value, void>::type> + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } + template + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data ^ other.data); + } + + // floor + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF); + } + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF); + } + + // fnma + template + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm512_fnmadd_ps(x, y, z); + } + + template + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm512_fnmadd_pd(x, y, z); + } + + // fma + template + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm512_fmadd_ps(x, y, z); + } + + template + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm512_fmadd_pd(x, y, z); + } + + // fms + template + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm512_fmsub_ps(x, y, z); + } + + template + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm512_fmsub_pd(x, y, z); + } + + // from bool + template + inline batch from_bool(batch_bool const& self, requires_arch) noexcept + { + return select(self, batch(1), batch(0)); + } + + // from_mask + template + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + return static_cast::register_type>(mask); + } + + // gather + template = 0, detail::enable_sized_integral_t = 0> + inline batch gather(batch const&, T const* src, batch const& index, + kernel::requires_arch) noexcept + { + return _mm512_i32gather_epi32(index, static_cast(src), sizeof(T)); + } + + template = 0, detail::enable_sized_integral_t = 0> + inline batch gather(batch const&, T const* src, batch const& index, + kernel::requires_arch) noexcept + { + return _mm512_i64gather_epi64(index, static_cast(src), sizeof(T)); + } + + template = 0> + inline batch gather(batch const&, float const* src, + batch const& index, + kernel::requires_arch) noexcept + { + return _mm512_i32gather_ps(index, src, sizeof(float)); + } + + template = 0> + inline batch + gather(batch const&, double const* src, batch const& index, + kernel::requires_arch) noexcept + { + return _mm512_i64gather_pd(index, src, sizeof(double)); + } + + // gather: handmade conversions + template = 0> + inline batch gather(batch const&, double const* src, + batch const& index, + requires_arch) noexcept + { + const batch low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double))); + const batch high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double))); + return detail::merge_avx(_mm512_cvtpd_ps(low.data), _mm512_cvtpd_ps(high.data)); + } + + template = 0> + inline batch gather(batch const&, double const* src, + batch const& index, + requires_arch) noexcept + { + const batch low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double))); + const batch high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double))); + return detail::merge_avx(_mm512_cvtpd_epi32(low.data), _mm512_cvtpd_epi32(high.data)); + } + + // ge + template + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ); + } + template + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ); + } + template ::value, void>::type> + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } + + // gt + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ); + } + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ); + } + template ::value, void>::type> + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } + + // haddp + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + // The following folds over the vector once: + // tmp1 = [a0..8, b0..8] + // tmp2 = [a8..f, b8..f] +#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ + batch res##I; \ + { \ + auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ + auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ + res##I = _mm512_add_ps(tmp1, tmp2); \ + } + + XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); + XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); + XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); + XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); + XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); + XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); + XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); + XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); + +#undef XSIMD_AVX512_HADDP_STEP1 + + // The following flds the code and shuffles so that hadd_ps produces the correct result + // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) + // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) + // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... +#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ + batch halfx##I; \ + { \ + auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx1 = _mm512_add_ps(tmp1, tmp2); \ + \ + auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx2 = _mm512_add_ps(tmp3, tmp4); \ + \ + auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx3 = _mm512_add_ps(tmp5, tmp6); \ + \ + halfx##I = _mm256_hadd_ps(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 0)), _mm512_extractf32x4_ps(resx3, 1), 1), \ + _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 2)), _mm512_extractf32x4_ps(resx3, 3), 1)); \ + } + + XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); + XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); + +#undef XSIMD_AVX512_HADDP_STEP2 + + auto concat = _mm512_castps256_ps512(halfx0); + concat = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(concat), _mm256_castps_pd(halfx1), 1)); + return concat; + } + + template + inline batch haddp(batch const* row, requires_arch) noexcept + { +#define step1(I, a, b) \ + batch res##I; \ + { \ + auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ + auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ + res##I = _mm512_add_pd(tmp1, tmp2); \ + } + + step1(1, row[0], row[2]); + step1(2, row[4], row[6]); + step1(3, row[1], row[3]); + step1(4, row[5], row[7]); + +#undef step1 + + auto tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0)); + auto tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1)); + + auto resx1 = _mm512_add_pd(tmp5, tmp6); + + auto tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0)); + auto tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1)); + + auto resx2 = _mm512_add_pd(tmp7, tmp8); + + auto tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000); + auto tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111); + + return _mm512_add_pd(tmpx, tmpy); + } + + // isnan + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q); + } + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q); + } + + // ldexp + template + inline batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept + { + return _mm512_scalef_ps(self, _mm512_cvtepi32_ps(other)); + } + + template + inline batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept + { + // FIXME: potential data loss here when converting other elements to + // int32 before converting them back to double. + __m512d adjusted_index = _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(other)); + return _mm512_scalef_pd(self, adjusted_index); + } + + // le + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ); + } + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ); + } + template ::value, void>::type> + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } + + // load_aligned + template ::value, void>::type> + inline batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + return _mm512_load_si512((__m512i const*)mem); + } + template + inline batch load_aligned(float const* mem, convert, requires_arch) noexcept + { + return _mm512_load_ps(mem); + } + template + inline batch load_aligned(double const* mem, convert, requires_arch) noexcept + { + return _mm512_load_pd(mem); + } + + // load_complex + namespace detail + { + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); + auto real = _mm512_permutex2var_ps(hi, real_idx, lo); + auto imag = _mm512_permutex2var_ps(hi, imag_idx, lo); + return { real, imag }; + } + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14); + __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15); + auto real = _mm512_permutex2var_pd(hi, real_idx, lo); + auto imag = _mm512_permutex2var_pd(hi, imag_idx, lo); + return { real, imag }; + } + } + + // load_unaligned + template ::value, void>::type> + inline batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return _mm512_loadu_si512((__m512i const*)mem); + } + template + inline batch load_unaligned(float const* mem, convert, requires_arch) noexcept + { + return _mm512_loadu_ps(mem); + } + template + inline batch load_unaligned(double const* mem, convert, requires_arch) noexcept + { + return _mm512_loadu_pd(mem); + } + + // lt + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ); + } + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ); + } + + template ::value, void>::type> + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } + + // mask + template + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return self.data; + } + + // max + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_max_ps(self, other); + } + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_max_pd(self, other); + } + template ::value, void>::type> + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_max_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_max_epi64(self, other); + } + else + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return max(batch(s), batch(o)); }, + self, other); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_max_epu32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_max_epu64(self, other); + } + else + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return max(batch(s), batch(o)); }, + self, other); + } + } + } + + // min + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_min_ps(self, other); + } + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_min_pd(self, other); + } + template ::value, void>::type> + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_min_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_min_epi64(self, other); + } + else + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return min(batch(s), batch(o)); }, + self, other); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_min_epu32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_min_epu64(self, other); + } + else + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return min(batch(s), batch(o)); }, + self, other); + } + } + } + + // mul + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mul_ps(self, other); + } + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mul_pd(self, other); + } + template ::value, void>::type> + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_mullo_epi32(self, other); + } + else + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return mul(batch(s), batch(o)); }, + self, other); + } + } + + // nearbyint + template + inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); + } + template + inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); + } + + // nearbyint_as_int + template + inline batch nearbyint_as_int(batch const& self, + requires_arch) noexcept + { + return _mm512_cvtps_epi32(self); + } + + // neg + template + inline batch neg(batch const& self, requires_arch) noexcept + { + return 0 - self; + } + + // neq + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_NEQ_UQ); + } + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_NEQ_UQ); + } + template ::value, void>::type> + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self == other); + } + + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data ^ other.data); + } + + // reciprocal + template + inline batch + reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm512_rcp14_ps(self); + } + + template + inline batch + reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm512_rcp14_pd(self); + } + + // reduce_add + template + inline float reduce_add(batch const& rhs, requires_arch) noexcept + { + __m128 tmp1 = _mm512_extractf32x4_ps(rhs, 0); + __m128 tmp2 = _mm512_extractf32x4_ps(rhs, 1); + __m128 tmp3 = _mm512_extractf32x4_ps(rhs, 2); + __m128 tmp4 = _mm512_extractf32x4_ps(rhs, 3); + __m128 res1 = _mm_add_ps(tmp1, tmp2); + __m128 res2 = _mm_add_ps(tmp3, tmp4); + __m128 res3 = _mm_add_ps(res1, res2); + return reduce_add(batch(res3), sse4_2 {}); + } + template + inline double reduce_add(batch const& rhs, requires_arch) noexcept + { + __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1); + __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0); + __m256d res1 = _mm256_add_pd(tmp1, tmp2); + return reduce_add(batch(res1), avx2 {}); + } + template ::value, void>::type> + inline T reduce_add(batch const& self, requires_arch) noexcept + { + __m256i low, high; + detail::split_avx512(self, low, high); + batch blow(low), bhigh(high); + return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {}); + } + + // reduce_max + template ::type> + inline T reduce_max(batch const& self, requires_arch) noexcept + { + constexpr batch_constant, 5, 6, 7, 8, 0, 0, 0, 0> mask; + batch step = _mm512_permutexvar_epi64((batch)mask, self); + batch acc = max(self, step); + __m256i low = _mm512_castsi512_si256(acc); + return reduce_max(batch(low)); + } + + // reduce_min + template ::type> + inline T reduce_min(batch const& self, requires_arch) noexcept + { + constexpr batch_constant, 5, 6, 7, 8, 0, 0, 0, 0> mask; + batch step = _mm512_permutexvar_epi64((batch)mask, self); + batch acc = min(self, step); + __m256i low = _mm512_castsi512_si256(acc); + return reduce_min(batch(low)); + } + + // rsqrt + template + inline batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm512_rsqrt14_ps(val); + } + template + inline batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm512_rsqrt14_pd(val); + } + + // sadd + template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = other < 0; + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(mask, self_neg_branch, self_pos_branch); + } + else + { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } + + // scatter + template ::value || std::is_same::value, void>::type> + inline void scatter(batch const& src, T* dst, + batch const& index, + kernel::requires_arch) noexcept + { + _mm512_i32scatter_epi32(dst, index, src, sizeof(T)); + } + + template ::value || std::is_same::value, void>::type> + inline void scatter(batch const& src, T* dst, + batch const& index, + kernel::requires_arch) noexcept + { + _mm512_i64scatter_epi64(dst, index, src, sizeof(T)); + } + + template + inline void scatter(batch const& src, float* dst, + batch const& index, + kernel::requires_arch) noexcept + { + _mm512_i32scatter_ps(dst, index, src, sizeof(float)); + } + + template + inline void scatter(batch const& src, double* dst, + batch const& index, + kernel::requires_arch) noexcept + { + _mm512_i64scatter_pd(dst, index, src, sizeof(double)); + } + + // select + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm512_mask_blend_ps(cond, false_br, true_br); + } + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm512_mask_blend_pd(cond, false_br, true_br); + } + + template ::value, void>::type> + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + alignas(avx2::alignment()) uint8_t buffer[64]; + // FIXME: ultra inefficient + for (int i = 0; i < 64; ++i) + buffer[i] = cond.data & (1ull << i) ? 0xFF : 0; + __m256i cond_low = batch::load_aligned(&buffer[0]); + __m256i cond_hi = batch::load_aligned(&buffer[32]); + + __m256i true_low, true_hi; + detail::split_avx512(true_br, true_low, true_hi); + + __m256i false_low, false_hi; + detail::split_avx512(false_br, false_low, false_hi); + + __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); + __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {}); + return detail::merge_avx(res_low, res_hi); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0)); + __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0)); + + __m256i true_low, true_hi; + detail::split_avx512(true_br, true_low, true_hi); + + __m256i false_low, false_hi; + detail::split_avx512(false_br, false_low, false_hi); + + __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); + __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {}); + return detail::merge_avx(res_low, res_hi); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_mask_blend_epi32(cond, false_br, true_br); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_mask_blend_epi64(cond, false_br, true_br); + } + else + { + assert(false && "unsupported arch/type combination"); + return {}; + } + } + + template ::value, void>::type> + inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... }, true_br, false_br, avx512f {}); + } + + namespace detail + { + template + using enable_signed_integer_t = typename std::enable_if::value && std::is_signed::value, + int>::type; + + template + using enable_unsigned_integer_t = typename std::enable_if::value && std::is_unsigned::value, + int>::type; + } + + // set + template + inline batch set(batch const&, requires_arch, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) noexcept + { + return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + + template + inline batch set(batch const&, requires_arch, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) noexcept + { + return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept + { + return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + { + return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + template = 0> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept + { +#if defined(__clang__) || __GNUC__ + return __extension__(__m512i)(__v32hi) { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + }; +#else + return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); +#endif + } + + template = 0> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept + { +#if defined(__clang__) || __GNUC__ + return __extension__(__m512i)(__v32hu) { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + }; +#else + return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); +#endif + } + + template = 0> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, + T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, + T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, + T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, + T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept + { + +#if defined(__clang__) || __GNUC__ + return __extension__(__m512i)(__v64qi) { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 + }; +#else + return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); +#endif + } + template = 0> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, + T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, + T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, + T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, + T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept + { + +#if defined(__clang__) || __GNUC__ + return __extension__(__m512i)(__v64qu) { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 + }; +#else + return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); +#endif + } + + template + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + using register_type = typename batch_bool::register_type; + register_type r = 0; + unsigned shift = 0; + (void)std::initializer_list { (r |= register_type(values ? 1 : 0) << (shift++))... }; + return r; + } + + // slide_left + template + inline batch slide_left(batch const&, requires_arch) noexcept + { + static_assert(N == 0xDEAD, "not implemented yet"); + return {}; + } + + // slide_right + template + inline batch slide_right(batch const&, requires_arch) noexcept + { + static_assert(N == 0xDEAD, "not implemented yet"); + return {}; + } + + // sqrt + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm512_sqrt_ps(val); + } + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm512_sqrt_pd(val); + } + + // ssub + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + return sadd(self, -other); + } + else + { + const auto diff = min(self, other); + return self - diff; + } + } + + // store + template + inline void store(batch_bool const& self, bool* mem, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr auto size = batch_bool::size; + for (std::size_t i = 0; i < size; ++i) + mem[i] = self.data & (register_type(1) << i); + } + + // store_aligned + template ::value, void>::type> + inline void store_aligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm512_store_si512((__m512i*)mem, self); + } + template ::value, void>::type> + inline void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm512_store_si512((__m512i*)mem, self); + } + template + inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm512_store_ps(mem, self); + } + template + inline void store_aligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm512_store_pd(mem, self); + } + + // store_unaligned + template ::value, void>::type> + inline void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm512_storeu_si512((__m512i*)mem, self); + } + template ::value, void>::type> + inline void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm512_storeu_si512((__m512i*)mem, self); + } + template + inline void store_unaligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm512_storeu_ps(mem, self); + } + template + inline void store_unaligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm512_storeu_pd(mem, self); + } + + // sub + template ::value, void>::type> + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return sub(batch(s), batch(o)); }, + self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return sub(batch(s), batch(o)); }, + self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_sub_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_sub_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_sub_ps(self, other); + } + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_sub_pd(self, other); + } + + // swizzle + template + inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept + { + return _mm512_permutexvar_ps((batch)mask, self); + } + + template + inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept + { + return _mm512_permutexvar_pd((batch)mask, self); + } + + template + inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept + { + return _mm512_permutexvar_epi64((batch)mask, self); + } + + template + inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); + } + + template + inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept + { + return _mm512_permutexvar_epi32((batch)mask, self); + } + + template + inline batch swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); + } + + namespace detail + { + template + struct is_pair_of_contiguous_indices; + + template + struct is_pair_of_contiguous_indices : std::true_type + { + }; + + template + struct is_pair_of_contiguous_indices : std::conditional<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices, std::false_type>::type + { + }; + + template + struct fold_batch_constant + { + using type = batch_constant, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2, + I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>; + }; + + } + + template ::value, void>::type> + inline batch swizzle(batch const& self, batch_constant, Idx...>, requires_arch) noexcept + { + constexpr typename detail::fold_batch_constant::type mask32; + return _mm512_permutexvar_epi32(static_cast>(mask32), self); + } + + template + inline batch + swizzle(batch const& self, batch_constant, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch) noexcept + { + // FIXME: this sequence is very inefficient, but it's here to catch + // a pattern generated by detail::reduce from xsimd_generic_math.hpp. + // The whole pattern is actually decently folded by GCC and Clang, + // so bare with it. + constexpr batch_constant, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32; + auto tmp = _mm512_permutexvar_epi32(static_cast>(mask32), self); + + alignas(A::alignment()) uint16_t buffer[32]; + _mm512_store_si512((__m512i*)&buffer[0], tmp); + buffer[0] = buffer[1]; + return _mm512_load_si512(&buffer[0]); + } + + template + inline batch + swizzle(batch const& self, batch_constant, Vs...> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); + } + + // trunc + template + inline batch + trunc(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); + } + template + inline batch + trunc(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); + } + + // zip_hi + template ::value, void>::type> + inline batch + zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + __m512i lo, hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + assert(false && "not implemented yet"); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + assert(false && "not implemented yet"); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + lo = _mm512_unpacklo_epi32(self, other); + hi = _mm512_unpackhi_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + lo = _mm512_unpacklo_epi64(self, other); + hi = _mm512_unpackhi_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + return _mm512_inserti32x4( + _mm512_inserti32x4( + _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0), + _mm512_extracti32x4_epi32(lo, 3), + 2), + _mm512_extracti32x4_epi32(hi, 2), + 1); + } + template + inline batch + zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm512_unpacklo_ps(self, other); + auto hi = _mm512_unpackhi_ps(self, other); + return _mm512_insertf32x4( + _mm512_insertf32x4( + _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0), + _mm512_extractf32x4_ps(lo, 3), + 2), + _mm512_extractf32x4_ps(hi, 2), + 1); + } + template + inline batch + zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other)); + auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other)); + return _mm512_castps_pd(_mm512_insertf32x4( + _mm512_insertf32x4( + _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0), + _mm512_extractf32x4_ps(lo, 3), + 2), + _mm512_extractf32x4_ps(hi, 2), + 1)); + } + + // zip_lo + template ::value, void>::type> + inline batch + zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + __m512i lo, hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + assert(false && "not implemented yet"); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + assert(false && "not implemented yet"); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + lo = _mm512_unpacklo_epi32(self, other); + hi = _mm512_unpackhi_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + lo = _mm512_unpacklo_epi64(self, other); + hi = _mm512_unpackhi_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + return _mm512_inserti32x4( + _mm512_inserti32x4( + _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1), + _mm512_extracti32x4_epi32(hi, 1), + 3), + _mm512_extracti32x4_epi32(lo, 1), + 2); + } + template + inline batch + zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm512_unpacklo_ps(self, other); + auto hi = _mm512_unpackhi_ps(self, other); + return _mm512_insertf32x4( + _mm512_insertf32x4( + _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1), + _mm512_extractf32x4_ps(hi, 1), + 3), + _mm512_extractf32x4_ps(lo, 1), + 2); + } + template + inline batch + zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other)); + auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other)); + return _mm512_castps_pd(_mm512_insertf32x4( + _mm512_insertf32x4( + _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1), + _mm512_extractf32x4_ps(hi, 1), + 3), + _mm512_extractf32x4_ps(lo, 1), + 2)); + } + + } + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp new file mode 100644 index 0000000000..1ae77e8c7d --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp @@ -0,0 +1,384 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_NUMERICAL_CONSTANT_HPP +#define XSIMD_NUMERICAL_CONSTANT_HPP + +#include + +#include "../types/xsimd_utils.hpp" + +namespace xsimd +{ + + namespace constants + { + +#define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \ + template \ + inline T NAME() noexcept \ + { \ + return T(NAME()); \ + } \ + template <> \ + inline float NAME() noexcept \ + { \ + return SINGLE; \ + } \ + template <> \ + inline double NAME() noexcept \ + { \ + return DOUBLE; \ + } + +#define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \ + template \ + inline T NAME() noexcept \ + { \ + return T(NAME()); \ + } \ + template <> \ + inline float NAME() noexcept \ + { \ + return bit_cast((uint32_t)SINGLE); \ + } \ + template <> \ + inline double NAME() noexcept \ + { \ + return bit_cast((uint64_t)DOUBLE); \ + } + + XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits::infinity()), (std::numeric_limits::infinity())) + XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986) + XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000) + XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200) + XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949) + XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883) + XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553) + XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000) + XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76) + XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000) + XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312) + XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12) + XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd) + XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5) + XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0) + XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400) + XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.) + XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167) + XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18) + XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641) + XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.) + XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167) + XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity()), (-infinity())) + XSIMD_DEFINE_CONSTANT(minuszero, -0.0f, -0.0) + XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff) + XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000) + XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000) + XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18) + XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07) + XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07) + XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18) + XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000) + XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331) + XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000) + XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073) + XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000) + XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1) + XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18) + XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000) + XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits::min(), std::numeric_limits::min()) + XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704) + XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000) + XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31) + XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6) + XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e) + XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883) + XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0) + XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286) + +#undef XSIMD_DEFINE_CONSTANT +#undef XSIMD_DEFINE_CONSTANT_HEX + + template + constexpr T allbits() noexcept; + + template + constexpr as_integer_t mask1frexp() noexcept; + + template + constexpr as_integer_t mask2frexp() noexcept; + + template + constexpr as_integer_t maxexponent() noexcept; + + template + constexpr as_integer_t maxexponentm1() noexcept; + + template + constexpr int32_t nmb() noexcept; + + template + constexpr T zero() noexcept; + + template + constexpr T minvalue() noexcept; + + template + constexpr T maxvalue() noexcept; + + /************************** + * allbits implementation * + **************************/ + + namespace detail + { + template ::value> + struct allbits_impl + { + static constexpr T get_value() noexcept + { + return T(~0); + } + }; + + template + struct allbits_impl + { + static constexpr T get_value() noexcept + { + return nan(); + } + }; + } + + template + inline constexpr T allbits() noexcept + { + return T(detail::allbits_impl::get_value()); + } + + /***************************** + * mask1frexp implementation * + *****************************/ + + template + inline constexpr as_integer_t mask1frexp() noexcept + { + return as_integer_t(mask1frexp()); + } + + template <> + inline constexpr int32_t mask1frexp() noexcept + { + return 0x7f800000; + } + + template <> + inline constexpr int64_t mask1frexp() noexcept + { + return 0x7ff0000000000000; + } + + /***************************** + * mask2frexp implementation * + *****************************/ + + template + inline constexpr as_integer_t mask2frexp() noexcept + { + return as_integer_t(mask2frexp()); + } + + template <> + inline constexpr int32_t mask2frexp() noexcept + { + return 0x3f000000; + } + + template <> + inline constexpr int64_t mask2frexp() noexcept + { + return 0x3fe0000000000000; + } + + /****************************** + * maxexponent implementation * + ******************************/ + + template + inline constexpr as_integer_t maxexponent() noexcept + { + return as_integer_t(maxexponent()); + } + + template <> + inline constexpr int32_t maxexponent() noexcept + { + return 127; + } + + template <> + inline constexpr int64_t maxexponent() noexcept + { + return 1023; + } + + /****************************** + * maxexponent implementation * + ******************************/ + + template + inline constexpr as_integer_t maxexponentm1() noexcept + { + return as_integer_t(maxexponentm1()); + } + + template <> + inline constexpr int32_t maxexponentm1() noexcept + { + return 126; + } + + template <> + inline constexpr int64_t maxexponentm1() noexcept + { + return 1022; + } + + /********************** + * nmb implementation * + **********************/ + + template + inline constexpr int32_t nmb() noexcept + { + return nmb(); + } + + template <> + inline constexpr int32_t nmb() noexcept + { + return 23; + } + + template <> + inline constexpr int32_t nmb() noexcept + { + return 52; + } + + /*********************** + * zero implementation * + ***********************/ + + template + inline constexpr T zero() noexcept + { + return T(typename T::value_type(0)); + } + + /*************************** + * minvalue implementation * + ***************************/ + + namespace detail + { + template + struct minvalue_impl + { + static constexpr T get_value() noexcept + { + return std::numeric_limits::min(); + } + }; + + template + struct minvalue_common + { + static constexpr T get_value() noexcept + { + return std::numeric_limits::min(); + } + }; + + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + + template <> + struct minvalue_impl + { + static float get_value() noexcept + { + return bit_cast((uint32_t)0xff7fffff); + } + }; + + template <> + struct minvalue_impl + { + static double get_value() noexcept + { + return bit_cast((uint64_t)0xffefffffffffffff); + } + }; + } + + template + inline constexpr T minvalue() noexcept + { + return T(detail::minvalue_impl::get_value()); + } + + /*************************** + * maxvalue implementation * + ***************************/ + + template + inline constexpr T maxvalue() noexcept + { + return T(std::numeric_limits::max()); + } + } + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp new file mode 100644 index 0000000000..64e9ed65d1 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp @@ -0,0 +1,80 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_FMA3_AVX_HPP +#define XSIMD_FMA3_AVX_HPP + +#include "../types/xsimd_fma3_avx_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // fnma + template + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fnmadd_ps(x, y, z); + } + + template + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fnmadd_pd(x, y, z); + } + + // fnms + template + inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fnmsub_ps(x, y, z); + } + + template + inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fnmsub_pd(x, y, z); + } + + // fma + template + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fmadd_ps(x, y, z); + } + + template + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fmadd_pd(x, y, z); + } + + // fms + template + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fmsub_ps(x, y, z); + } + + template + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fmsub_pd(x, y, z); + } + + } + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp new file mode 100644 index 0000000000..134053951a --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp @@ -0,0 +1,46 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_FMA3_AVX2_HPP +#define XSIMD_FMA3_AVX2_HPP + +#include "../types/xsimd_fma3_avx2_register.hpp" + +// Allow inclusion of xsimd_fma3_avx.hpp +#ifdef XSIMD_FMA3_AVX_HPP +#undef XSIMD_FMA3_AVX_HPP +#define XSIMD_FORCE_FMA3_AVX_HPP +#endif + +// Disallow inclusion of ./xsimd_fma3_avx_register.hpp +#ifndef XSIMD_FMA3_AVX_REGISTER_HPP +#define XSIMD_FMA3_AVX_REGISTER_HPP +#define XSIMD_FORCE_FMA3_AVX_REGISTER_HPP +#endif + +// Include ./xsimd_fma3_avx.hpp but s/avx/avx2 +#define avx avx2 +#include "./xsimd_fma3_avx.hpp" +#undef avx +#undef XSIMD_FMA3_AVX_HPP + +// Carefully restore guards +#ifdef XSIMD_FORCE_FMA3_AVX_HPP +#define XSIMD_FMA3_AVX_HPP +#undef XSIMD_FORCE_FMA3_AVX_HPP +#endif + +#ifdef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP +#undef XSIMD_FMA3_AVX_REGISTER_HPP +#undef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP +#endif + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp new file mode 100644 index 0000000000..55c38f13a4 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp @@ -0,0 +1,79 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_FMA3_SSE_HPP +#define XSIMD_FMA3_SSE_HPP + +#include "../types/xsimd_fma3_sse_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + // fnma + template + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fnmadd_ps(x, y, z); + } + + template + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fnmadd_pd(x, y, z); + } + + // fnms + template + inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fnmsub_ps(x, y, z); + } + + template + inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fnmsub_pd(x, y, z); + } + + // fma + template + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fmadd_ps(x, y, z); + } + + template + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fmadd_pd(x, y, z); + } + + // fms + template + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fmsub_ps(x, y, z); + } + + template + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fmsub_pd(x, y, z); + } + + } + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_fma4.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_fma4.hpp new file mode 100644 index 0000000000..6a97d711e9 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_fma4.hpp @@ -0,0 +1,79 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_FMA4_HPP +#define XSIMD_FMA4_HPP + +#include "../types/xsimd_fma4_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // fnma + template + inline batch fnma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_nmacc_ps(x, y, z); + } + + template + inline batch fnma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_nmacc_pd(x, y, z); + } + + // fnms + template + inline batch fnms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_nmsub_ps(x, y, z); + } + + template + inline batch fnms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_nmsub_pd(x, y, z); + } + + // fma + template + inline batch fma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_macc_ps(x, y, z); + } + + template + inline batch fma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_macc_pd(x, y, z); + } + + // fms + template + inline batch fms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_msub_ps(x, y, z); + } + + template + inline batch fms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_msub_pd(x, y, z); + } + } + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_generic.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_generic.hpp new file mode 100644 index 0000000000..6403cfb0fc --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_generic.hpp @@ -0,0 +1,23 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_HPP +#define XSIMD_GENERIC_HPP + +#include "./generic/xsimd_generic_arithmetic.hpp" +#include "./generic/xsimd_generic_complex.hpp" +#include "./generic/xsimd_generic_logical.hpp" +#include "./generic/xsimd_generic_math.hpp" +#include "./generic/xsimd_generic_memory.hpp" +#include "./generic/xsimd_generic_rounding.hpp" +#include "./generic/xsimd_generic_trigo.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp new file mode 100644 index 0000000000..86e398a5ea --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp @@ -0,0 +1,38 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_FWD_HPP +#define XSIMD_GENERIC_FWD_HPP + +#include "../types/xsimd_batch_constant.hpp" + +#include + +namespace xsimd +{ + namespace kernel + { + // forward declaration + template ::value, void>::type> + inline batch abs(batch const& self, requires_arch) noexcept; + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept; + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept; + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept; + template ::value, void>::type> + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept; + + } +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp new file mode 100644 index 0000000000..cf0f796a1e --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp @@ -0,0 +1,86 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_ISA_HPP +#define XSIMD_ISA_HPP + +#include "../config/xsimd_arch.hpp" + +#include "./xsimd_generic_fwd.hpp" + +#if XSIMD_WITH_SSE2 +#include "./xsimd_sse2.hpp" +#endif + +#if XSIMD_WITH_SSE3 +#include "./xsimd_sse3.hpp" +#endif + +#if XSIMD_WITH_SSSE3 +#include "./xsimd_ssse3.hpp" +#endif + +#if XSIMD_WITH_SSE4_1 +#include "./xsimd_sse4_1.hpp" +#endif + +#if XSIMD_WITH_SSE4_2 +#include "./xsimd_sse4_2.hpp" +#endif + +#if XSIMD_WITH_FMA3_SSE +#include "./xsimd_fma3_sse.hpp" +#endif + +#if XSIMD_WITH_FMA4 +#include "./xsimd_fma4.hpp" +#endif + +#if XSIMD_WITH_AVX +#include "./xsimd_avx.hpp" +#endif + +#if XSIMD_WITH_FMA3_AVX +#include "./xsimd_fma3_avx.hpp" +#endif + +#if XSIMD_WITH_AVX2 +#include "./xsimd_avx2.hpp" +#endif + +#if XSIMD_WITH_FMA3_AVX2 +#include "./xsimd_fma3_avx2.hpp" +#endif + +#if XSIMD_WITH_AVX512F +#include "./xsimd_avx512f.hpp" +#endif + +#if XSIMD_WITH_AVX512BW +#include "./xsimd_avx512bw.hpp" +#endif + +#if XSIMD_WITH_NEON +#include "./xsimd_neon.hpp" +#endif + +#if XSIMD_WITH_NEON64 +#include "./xsimd_neon64.hpp" +#endif + +#if XSIMD_WITH_SVE +#include "./xsimd_sve.hpp" +#endif + +// Must come last to have access to all conversion specializations. +#include "./xsimd_generic.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp new file mode 100644 index 0000000000..57c662cd63 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp @@ -0,0 +1,2670 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_NEON_HPP +#define XSIMD_NEON_HPP + +#include +#include +#include +#include + +#include "../types/xsimd_neon_register.hpp" +#include "../types/xsimd_utils.hpp" + +// Wrap intrinsics so we can pass them as function pointers +// - OP: intrinsics name prefix, e.g., vorrq +// - RT: type traits to deduce intrinsics return types +#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap \ + { \ + inline RT OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \ + { \ + return ::OP##_u8(a, b); \ + } \ + inline RT OP##_s8(int8x16_t a, int8x16_t b) noexcept \ + { \ + return ::OP##_s8(a, b); \ + } \ + inline RT OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \ + { \ + return ::OP##_u16(a, b); \ + } \ + inline RT OP##_s16(int16x8_t a, int16x8_t b) noexcept \ + { \ + return ::OP##_s16(a, b); \ + } \ + inline RT OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \ + { \ + return ::OP##_u32(a, b); \ + } \ + inline RT OP##_s32(int32x4_t a, int32x4_t b) noexcept \ + { \ + return ::OP##_s32(a, b); \ + } \ + } + +#define WRAP_BINARY_INT(OP, RT) \ + WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap \ + { \ + inline RT OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \ + { \ + return ::OP##_u64(a, b); \ + } \ + inline RT OP##_s64(int64x2_t a, int64x2_t b) noexcept \ + { \ + return ::OP##_s64(a, b); \ + } \ + } + +#define WRAP_BINARY_FLOAT(OP, RT) \ + namespace wrap \ + { \ + inline RT OP##_f32(float32x4_t a, float32x4_t b) noexcept \ + { \ + return ::OP##_f32(a, b); \ + } \ + } + +#define WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap \ + { \ + inline uint8x16_t OP##_u8(uint8x16_t a) noexcept \ + { \ + return ::OP##_u8(a); \ + } \ + inline int8x16_t OP##_s8(int8x16_t a) noexcept \ + { \ + return ::OP##_s8(a); \ + } \ + inline uint16x8_t OP##_u16(uint16x8_t a) noexcept \ + { \ + return ::OP##_u16(a); \ + } \ + inline int16x8_t OP##_s16(int16x8_t a) noexcept \ + { \ + return ::OP##_s16(a); \ + } \ + inline uint32x4_t OP##_u32(uint32x4_t a) noexcept \ + { \ + return ::OP##_u32(a); \ + } \ + inline int32x4_t OP##_s32(int32x4_t a) noexcept \ + { \ + return ::OP##_s32(a); \ + } \ + } + +#define WRAP_UNARY_INT(OP) \ + WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap \ + { \ + inline uint64x2_t OP##_u64(uint64x2_t a) noexcept \ + { \ + return ::OP##_u64(a); \ + } \ + inline int64x2_t OP##_s64(int64x2_t a) noexcept \ + { \ + return ::OP##_s64(a); \ + } \ + } + +#define WRAP_UNARY_FLOAT(OP) \ + namespace wrap \ + { \ + inline float32x4_t OP##_f32(float32x4_t a) noexcept \ + { \ + return ::OP##_f32(a); \ + } \ + } + +// Dummy identity caster to ease coding +inline uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; } +inline int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; } +inline uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; } +inline int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; } +inline uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; } +inline int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; } +inline uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; } +inline int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; } +inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; } + +namespace xsimd +{ + template + struct batch_bool_constant; + + namespace kernel + { + using namespace types; + + namespace detail + { + template