From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Sun, 7 Apr 2024 21:33:14 +0200
Subject: Adding upstream version 115.7.0esr.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 third_party/xsimd/Changelog.rst                    |  151 ++
 third_party/xsimd/LICENSE                          |   29 +
 .../arch/generic/xsimd_generic_arithmetic.hpp      |  152 ++
 .../xsimd/arch/generic/xsimd_generic_complex.hpp   |   96 +
 .../xsimd/arch/generic/xsimd_generic_details.hpp   |  239 ++
 .../xsimd/arch/generic/xsimd_generic_logical.hpp   |  163 ++
 .../xsimd/arch/generic/xsimd_generic_math.hpp      | 2418 ++++++++++++++++++
 .../xsimd/arch/generic/xsimd_generic_memory.hpp    |  397 +++
 .../xsimd/arch/generic/xsimd_generic_rounding.hpp  |   72 +
 .../xsimd/arch/generic/xsimd_generic_trigo.hpp     |  969 +++++++
 third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp | 1657 ++++++++++++
 .../xsimd/include/xsimd/arch/xsimd_avx2.hpp        |  950 +++++++
 .../xsimd/include/xsimd/arch/xsimd_avx512bw.hpp    |  627 +++++
 .../xsimd/include/xsimd/arch/xsimd_avx512cd.hpp    |   28 +
 .../xsimd/include/xsimd/arch/xsimd_avx512dq.hpp    |  212 ++
 .../xsimd/include/xsimd/arch/xsimd_avx512f.hpp     | 2028 +++++++++++++++
 .../xsimd/include/xsimd/arch/xsimd_constants.hpp   |  384 +++
 .../xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp    |   80 +
 .../xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp   |   46 +
 .../xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp    |   79 +
 .../xsimd/include/xsimd/arch/xsimd_fma4.hpp        |   79 +
 .../xsimd/include/xsimd/arch/xsimd_generic.hpp     |   23 +
 .../xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp |   38 +
 third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp |   86 +
 .../xsimd/include/xsimd/arch/xsimd_neon.hpp        | 2670 ++++++++++++++++++++
 .../xsimd/include/xsimd/arch/xsimd_neon64.hpp      | 1322 ++++++++++
 .../xsimd/include/xsimd/arch/xsimd_scalar.hpp      | 1043 ++++++++
 .../xsimd/include/xsimd/arch/xsimd_sse2.hpp        | 1695 +++++++++++++
 .../xsimd/include/xsimd/arch/xsimd_sse3.hpp        |   64 +
 .../xsimd/include/xsimd/arch/xsimd_sse4_1.hpp      |  350 +++
 .../xsimd/include/xsimd/arch/xsimd_sse4_2.hpp      |   44 +
 .../xsimd/include/xsimd/arch/xsimd_ssse3.hpp       |  142 ++
 third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp | 1126 +++++++++
 .../xsimd/include/xsimd/config/xsimd_arch.hpp      |  249 ++
 .../xsimd/include/xsimd/config/xsimd_config.hpp    |  350 +++
 .../xsimd/include/xsimd/config/xsimd_cpuid.hpp     |  181 ++
 .../xsimd/include/xsimd/math/xsimd_rem_pio2.hpp    |  719 ++++++
 .../xsimd/memory/xsimd_aligned_allocator.hpp       |  349 +++
 .../xsimd/include/xsimd/memory/xsimd_alignment.hpp |   91 +
 .../include/xsimd/types/xsimd_all_registers.hpp    |   32 +
 .../xsimd/include/xsimd/types/xsimd_api.hpp        | 2310 +++++++++++++++++
 .../include/xsimd/types/xsimd_avx2_register.hpp    |   40 +
 .../xsimd/types/xsimd_avx512bw_register.hpp        |   48 +
 .../xsimd/types/xsimd_avx512cd_register.hpp        |   48 +
 .../xsimd/types/xsimd_avx512dq_register.hpp        |   48 +
 .../include/xsimd/types/xsimd_avx512f_register.hpp |   75 +
 .../include/xsimd/types/xsimd_avx_register.hpp     |   62 +
 .../xsimd/include/xsimd/types/xsimd_batch.hpp      | 1491 +++++++++++
 .../include/xsimd/types/xsimd_batch_constant.hpp   |  147 ++
 .../xsimd/types/xsimd_fma3_avx2_register.hpp       |   46 +
 .../xsimd/types/xsimd_fma3_avx_register.hpp        |   46 +
 .../xsimd/types/xsimd_fma3_sse_register.hpp        |   46 +
 .../include/xsimd/types/xsimd_fma4_register.hpp    |   42 +
 .../include/xsimd/types/xsimd_generic_arch.hpp     |   35 +
 .../include/xsimd/types/xsimd_neon64_register.hpp  |   52 +
 .../include/xsimd/types/xsimd_neon_register.hpp    |  155 ++
 .../xsimd/include/xsimd/types/xsimd_register.hpp   |   94 +
 .../include/xsimd/types/xsimd_sse2_register.hpp    |   61 +
 .../include/xsimd/types/xsimd_sse3_register.hpp    |   45 +
 .../include/xsimd/types/xsimd_sse4_1_register.hpp  |   44 +
 .../include/xsimd/types/xsimd_sse4_2_register.hpp  |   44 +
 .../include/xsimd/types/xsimd_ssse3_register.hpp   |   44 +
 .../include/xsimd/types/xsimd_sve_register.hpp     |  155 ++
 .../xsimd/include/xsimd/types/xsimd_traits.hpp     |  319 +++
 .../xsimd/include/xsimd/types/xsimd_utils.hpp      |  530 ++++
 third_party/xsimd/include/xsimd/xsimd.hpp          |   68 +
 third_party/xsimd/moz.yaml                         |   37 +
 67 files changed, 27562 insertions(+)
 create mode 100644 third_party/xsimd/Changelog.rst
 create mode 100644 third_party/xsimd/LICENSE
 create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_complex.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_logical.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_rounding.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_fma4.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_generic.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp
 create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp
 create mode 100644 third_party/xsimd/include/xsimd/config/xsimd_arch.hpp
 create mode 100644 third_party/xsimd/include/xsimd/config/xsimd_config.hpp
 create mode 100644 third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp
 create mode 100644 third_party/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp
 create mode 100644 third_party/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp
 create mode 100644 third_party/xsimd/include/xsimd/memory/xsimd_alignment.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_api.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_avx2_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_batch.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_fma4_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_neon64_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_neon_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_sse3_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_traits.hpp
 create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_utils.hpp
 create mode 100644 third_party/xsimd/include/xsimd/xsimd.hpp
 create mode 100644 third_party/xsimd/moz.yaml

(limited to 'third_party/xsimd')

diff --git a/third_party/xsimd/Changelog.rst b/third_party/xsimd/Changelog.rst
new file mode 100644
index 0000000000..fa4acbce37
--- /dev/null
+++ b/third_party/xsimd/Changelog.rst
@@ -0,0 +1,151 @@
+.. Copyright (c) Serge Guelton and Johan Mabille
+   Copyright (c) QuantStack
+
+   Distributed under the terms of the BSD 3-Clause License.
+
+   The full license is in the file LICENSE, distributed with this software.
+
+
+Changelog
+=========
+
+9.0.1
+-----
+
+    * Fix potential ABI issue in SVE support, making ``xsimd::sve`` a type alias to
+      size-dependent type.
+
+9.0.0
+-----
+
+    * Support fixed size SVE
+
+    * Fix a bug in SSSE3 ``xsimd::swizzle`` implementation for ``int8`` and ``int16``
+
+    * Rename ``xsimd::hadd`` into ``xsimd::reduce_add``, provide ``xsimd::reduce_min`` and ``xsimd::reduce_max``
+
+    * Properly report unsupported double for neon on arm32
+
+    * Fill holes in xsimd scalar api
+
+    * Fix ``find_package(xsimd)`` for xtl enabled xsimd
+
+    * Replace ``xsimd::bool_cast`` by ``xsimd::batch_bool_cast``
+
+    * Native ``xsimd::hadd`` for float on arm64
+
+    * Properly static_assert when trying to instantiate an ``xsimd::batch`` of xtl complex
+
+    * Introduce ``xsimd::batch_bool::mask()`` and ``batch_bool::from_mask(...)``
+
+    * Flag some functions with ``[[nodiscard]]``
+
+    * Accept both relative and absolute libdir and include dir in xsimd.pc
+
+    * Implement ``xsimd::nearbyint_as_int`` for NEON
+
+    * Add ``xsimd::polar``
+
+    * Speedup double -> F32/I32 gathers
+
+    * Add ``xsimd::slide_left`` and ``xsimd::slide_right``
+
+    * Support integral ``xsimd::swizzles`` on AVX
+
+8.1.0
+-----
+
+    * Add ``xsimd::gather`` and ``xsimd::scatter``
+
+    * Add ``xsimd::nearbyint_as_int``
+
+    * Add ``xsimd::none``
+
+    * Add ``xsimd::reciprocal``
+
+    * Remove batch constructor from memory address, use ``xsimd::batch<...>::load_(un)aligned`` instead
+
+    * Leave to msvc users the opportunity to manually disable FMA3 on AVX
+
+    * Provide ``xsimd::insert`` to modify a single value from a vector
+
+    * Make ``xsimd::pow`` implementation resilient to ``FE_INVALID``
+
+    * Reciprocal square root support through ``xsimd::rsqrt``
+
+    * NEON: Improve ``xsimd::any`` and ``xsimd::all``
+
+    * Provide type utility to explicitly require a batch of given size and type
+
+    * Implement ``xsimd::swizzle`` on x86, neon and neon64
+
+    * Avx support for ``xsimd::zip_lo`` and ``xsimd::zip_hi``
+
+    * Only use ``_mm256_unpacklo_epi<N>`` on AVX2
+
+    * Provide neon/neon64 conversion function from ``uint(32|64)_t`` to ``(float|double)``
+
+    * Provide SSE/AVX/AVX2 conversion function from ``uint32_t`` to ``float``
+
+    * Provide AVX2 conversion function from ``(u)int64_t`` to ``double``
+
+    * Provide better SSE conversion function from ``uint64_t`` to ``double``
+
+    * Provide better SSE conversion function to ``double``
+
+    * Support logical xor for ``xsimd::batch_bool``
+
+    * Clarify fma support:
+
+        - FMA3 + SSE -> ``xsimd::fma3<sse4_2>``
+        - FMA3 + AVX -> ``xsimd::fma3<avx>``
+        - FMA3 + AVX2 -> ``xsimd::fma3<avx2>``
+        - FMA4 -> ``xsimd::fma4``
+
+    * Allow ``xsimd::transform`` to work with complex types
+
+    * Add missing scalar version of ``xsimd::norm`` and ``xsimd::conj``
+
+8.0.5
+-----
+
+    * Fix neon ``xsimd::hadd`` implementation
+
+    * Detect unsupported architectures and set ``XSIMD_NO_SUPPORTED_ARCHITECTURE``
+      if needs be
+
+8.0.4
+-----
+
+    * Provide some conversion operators for ``float`` -> ``uint32``
+
+    * Improve code generated for AVX2 signed integer comparisons
+
+    * Enable detection of avx512cd and avx512dq, and fix avx512bw detection
+
+    * Enable detection of AVX2+FMA
+
+    * Pick the best compatible architecture in ``xsimd::dispatch``
+
+    * Enables support for FMA when AVX2 is detected on Windows
+
+    * Add missing includes / forward declaration
+
+    * Mark all functions inline and noexcept
+
+    * Assert when using incomplete ``std::initializer_list``
+
+8.0.3
+-----
+
+    * Improve CI & testing, no functional change
+
+8.0.2
+-----
+
+    * Do not use ``_mm256_srai_epi32`` under AVX, it's an AVX2 instruction
+
+8.0.1
+-----
+
+    * Fix invalid constexpr ``std::make_tuple`` usage in neon64
diff --git a/third_party/xsimd/LICENSE b/third_party/xsimd/LICENSE
new file mode 100644
index 0000000000..eee7a54bc9
--- /dev/null
+++ b/third_party/xsimd/LICENSE
@@ -0,0 +1,29 @@
+Copyright (c) 2016, Johan Mabille, Sylvain Corlay, Wolf Vollprecht and Martin Renou
+Copyright (c) 2016, QuantStack
+Copyright (c) 2018, Serge Guelton
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp
new file mode 100644
index 0000000000..5b3fef6623
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp
@@ -0,0 +1,152 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_ARITHMETIC_HPP
+#define XSIMD_GENERIC_ARITHMETIC_HPP
+
+#include <complex>
+#include <type_traits>
+
+#include "./xsimd_generic_details.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+
+        using namespace types;
+
+        // bitwise_lshift: generic fallback that hands a scalar `x << y` lambda, together with (self, other), to detail::apply
+        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            return detail::apply([](T x, T y) noexcept
+                                 { return x << y; },
+                                 self, other);
+        }
+
+        // bitwise_rshift: generic fallback that hands a scalar `x >> y` lambda, together with (self, other), to detail::apply
+        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            return detail::apply([](T x, T y) noexcept
+                                 { return x >> y; },
+                                 self, other);
+        }
+
+        // div: generic fallback enabled only for integral T; a scalar `x / y` lambda (result converted back to T) goes to detail::apply
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            return detail::apply([](T x, T y) noexcept -> T
+                                 { return x / y; },
+                                 self, other);
+        }
+
+        // fma: generic fallback returning x * y + z, written with the batch multiplication and addition operators
+        template <class A, class T>
+        inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
+        {
+            return x * y + z;
+        }
+        // fma, complex overload: real = xr*yr - (xi*yi - zr), imag = xr*yi + (xi*yr + zi), built from fms/fma on the real batches
+        template <class A, class T>
+        inline batch<std::complex<T>, A> fma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+        {
+            auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
+            auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
+            return { res_r, res_i };
+        }
+
+        // fms: generic fallback returning x * y - z, written with the batch multiplication and subtraction operators
+        template <class A, class T>
+        inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
+        {
+            return x * y - z;
+        }
+        // fms, complex overload: real = xr*yr - (xi*yi + zr), imag = xr*yi + (xi*yr - zi), built from fms/fma on the real batches
+        template <class A, class T>
+        inline batch<std::complex<T>, A> fms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+        {
+            auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
+            auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
+            return { res_r, res_i };
+        }
+
+        // fnma: generic fallback returning -x * y + z (negated product plus z), written with plain batch operators
+        template <class A, class T>
+        inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
+        {
+            return -x * y + z;
+        }
+        // fnma, complex overload: negates both parts of (x*y - z), which is the same value as z - x*y
+        template <class A, class T>
+        inline batch<std::complex<T>, A> fnma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+        {
+            auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
+            auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
+            return { res_r, res_i };
+        }
+
+        // fnms: generic fallback returning -x * y - z (negated product minus z), written with plain batch operators
+        template <class A, class T>
+        inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
+        {
+            return -x * y - z;
+        }
+        // fnms, complex overload: negates both parts of (x*y + z), which is the same value as -(x*y) - z
+        template <class A, class T>
+        inline batch<std::complex<T>, A> fnms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+        {
+            auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
+            auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
+            return { res_r, res_i };
+        }
+
+        // mul: generic fallback that hands a scalar `x * y` lambda (result converted back to T) to detail::apply
+        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            return detail::apply([](T x, T y) noexcept -> T
+                                 { return x * y; },
+                                 self, other);
+        }
+
+        // sadd: "saturated add" overloads for float and double batches; both simply forward to add(self, other)
+        template <class A>
+        inline batch<float, A> sadd(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
+        {
+            return add(self, other); // no saturated arithmetic on floating point numbers
+        }
+        template <class A>
+        inline batch<double, A> sadd(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
+        {
+            return add(self, other); // no saturated arithmetic on floating point numbers
+        }
+
+        // ssub: "saturated subtract" overloads for float and double batches; both simply forward to sub(self, other)
+        template <class A>
+        inline batch<float, A> ssub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
+        {
+            return sub(self, other); // no saturated arithmetic on floating point numbers
+        }
+        template <class A>
+        inline batch<double, A> ssub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
+        {
+            return sub(self, other); // no saturated arithmetic on floating point numbers
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_complex.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_complex.hpp
new file mode 100644
index 0000000000..ede95ee937
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_complex.hpp
@@ -0,0 +1,96 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_COMPLEX_HPP
+#define XSIMD_GENERIC_COMPLEX_HPP
+
+#include <complex>
+
+#include "./xsimd_generic_details.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+
+        using namespace types;
+
+        // real: a batch<T, A> is returned unchanged; a batch of std::complex<T> returns its real() member
+        template <class A, class T>
+        inline batch<T, A> real(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return self;
+        }
+
+        template <class A, class T>
+        inline batch<T, A> real(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+        {
+            return self.real();
+        }
+
+        // imag: a batch<T, A> yields a batch built from T(0); a batch of std::complex<T> returns its imag() member
+        template <class A, class T>
+        inline batch<T, A> imag(batch<T, A> const& /*self*/, requires_arch<generic>) noexcept
+        {
+            return batch<T, A>(T(0));
+        }
+
+        template <class A, class T>
+        inline batch<T, A> imag(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+        {
+            return self.imag();
+        }
+
+        // arg: phase angle, computed as atan2(imag(self), real(self))
+        template <class A, class T>
+        inline real_batch_type_t<batch<T, A>> arg(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return atan2(imag(self), real(self));
+        }
+
+        // conj: complex conjugate, built from real(self) and the negated imag(self)
+        template <class A, class T>
+        inline complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return { real(self), -imag(self) };
+        }
+
+        // norm: re*re + im*im (squared magnitude, no square root taken), evaluated as fma(re, re, im * im)
+        template <class A, class T>
+        inline real_batch_type_t<batch<T, A>> norm(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return { fma(real(self), real(self), imag(self) * imag(self)) };
+        }
+
+        // proj: where real or imag is infinite, selects (infinity, zero carrying imag's sign); elsewhere keeps self as a complex batch
+        template <class A, class T>
+        inline complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = complex_batch_type_t<batch<T, A>>;
+            using real_batch = typename batch_type::real_batch;
+            using real_value_type = typename real_batch::value_type;
+            auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self));
+            return select(cond,
+                          batch_type(constants::infinity<real_batch>(),
+                                     copysign(real_batch(real_value_type(0)), imag(self))),
+                          batch_type(self));
+        }
+        // isnan (complex batch): true where the real part or the imaginary part is NaN
+        template <class A, class T>
+        inline batch_bool<T, A> isnan(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+        {
+            return batch_bool<T, A>(isnan(self.real()) || isnan(self.imag()));
+        }
+    }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp
new file mode 100644
index 0000000000..fd66e5d03c
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp
@@ -0,0 +1,239 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_DETAILS_HPP
+#define XSIMD_GENERIC_DETAILS_HPP
+
+#include <complex>
+
+#include "../../math/xsimd_rem_pio2.hpp"
+#include "../../types/xsimd_generic_arch.hpp"
+#include "../../types/xsimd_utils.hpp"
+#include "../xsimd_constants.hpp"
+
+namespace xsimd
+{
+    // Forward declaration. Should we put them in a separate file?
+    template <class T, class A>
+    inline batch<T, A> abs(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> abs(batch<std::complex<T>, A> const& self) noexcept;
+    template <class T, class A>
+    inline bool any(batch_bool<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+    template <class A, class T_out, class T_in>
+    inline batch<T_out, A> batch_cast(batch<T_in, A> const&, batch<T_out, A> const& out) noexcept;
+    template <class T, class A>
+    inline batch<T, A> bitofsign(batch<T, A> const& self) noexcept;
+    template <class T_out, class T_in, class A>
+    inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> cos(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> cosh(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> exp(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
+    template <class T, class A>
+    inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
+    template <class T, class A>
+    inline batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
+    template <class T, class A, uint64_t... Coefs>
+    inline batch<T, A> horner(const batch<T, A>& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> hypot(const batch<T, A>& self) noexcept; // NOTE(review): declared unary here, but the complex abs kernel calls hypot(z.real(), z.imag()) - confirm against the real definition
+    template <class T, class A>
+    inline batch_bool<T, A> is_even(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch_bool<T, A> is_flint(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch_bool<T, A> is_odd(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch_bool<T, A> isinf(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
+    template <class T, class A>
+    inline batch<T, A> log(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> nearbyint(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
+    template <class T, class A>
+    inline T reduce_add(batch<T, A> const&) noexcept;
+    template <class T, class A>
+    inline batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
+    template <class T, class A>
+    inline batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
+    template <class T, class A>
+    inline batch<T, A> sign(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> signnz(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> sin(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> sinh(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> sqrt(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> tan(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<as_float_t<T>, A> to_float(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<as_integer_t<T>, A> to_int(batch<T, A> const& self) noexcept;
+    template <class T, class A>
+    inline batch<T, A> trunc(batch<T, A> const& self) noexcept;
+
+    namespace kernel
+    {
+
+        namespace detail
+        {
+            // Scalar fallback for a binary operation: spill both batches, combine lane by lane, reload.
+            template <class F, class A, class T, class... Batches>
+            inline batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
+            {
+                using batch_type = batch<T, A>;
+                alignas(A::alignment()) T lhs[batch_type::size];
+                alignas(A::alignment()) T rhs[batch_type::size];
+                self.store_aligned(lhs);
+                other.store_aligned(rhs);
+                // Each result overwrites the matching lhs slot, so lhs doubles as the output buffer.
+                for (std::size_t lane = 0; lane < batch_type::size; ++lane)
+                    lhs[lane] = func(lhs[lane], rhs[lane]);
+                return batch_type::load_aligned(lhs);
+            }
+
+            // Scalar fallback for a unary operation whose result element type U may differ from T.
+            template <class U, class F, class A, class T>
+            inline batch<U, A> apply_transform(F&& func, batch<T, A> const& self) noexcept
+            {
+                using source_batch = batch<T, A>;
+                using result_batch = batch<U, A>;
+                static_assert(source_batch::size == result_batch::size,
+                              "Source and destination sizes must match");
+                alignas(A::alignment()) T input[source_batch::size];
+                alignas(A::alignment()) U output[result_batch::size];
+                self.store_aligned(input);
+                // Both batches hold the same number of lanes, so one index walks both buffers.
+                for (std::size_t lane = 0; lane < source_batch::size; ++lane)
+                    output[lane] = func(input[lane]);
+                return result_batch::load_aligned(output);
+            }
+        }
+
+        namespace detail
+        {
+            // Generic conversion handling machinery. Each architecture must define a
+            // conversion function when such a conversion exists in the form of an
+            // intrinsic. Then we use that information to automatically decide whether
+            // to use scalar or vector conversion when doing load / store / batch_cast
+            struct with_fast_conversion
+            {
+            };
+            struct with_slow_conversion
+            {
+            };
+
+            template <class A, class From, class To, class = void>
+            struct conversion_type_impl
+            {
+                using type = with_slow_conversion; // default when the specialization below does not apply
+            };
+
+            using xsimd::detail::void_t;
+
+            template <class A, class From, class To>
+            struct conversion_type_impl<A, From, To,
+                                        void_t<decltype(fast_cast(std::declval<const batch<From, A>&>(),
+                                                                  std::declval<const batch<To, A>&>(),
+                                                                  std::declval<const A&>()))>>
+            {
+                using type = with_fast_conversion; // chosen when the fast_cast(...) expression above is well-formed
+            };
+
+            template <class A, class From, class To>
+            using conversion_type = typename conversion_type_impl<A, From, To>::type;
+        }
+
+        namespace detail
+        {
+            /* origin: boost/simdfunction/horn.hpp*/
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class B, uint64_t c>
+            inline B coef() noexcept
+            {
+                using value_type = typename B::value_type;
+                return B(bit_cast<value_type>(as_unsigned_integer_t<value_type>(c))); // c is the coefficient's raw bit pattern, not its numeric value
+            }
+            template <class B>
+            inline B horner(const B&) noexcept
+            {
+                return B(typename B::value_type(0.)); // no coefficients: the result is 0
+            }
+
+            template <class B, uint64_t c0>
+            inline B horner(const B&) noexcept
+            {
+                return coef<B, c0>(); // last coefficient: ends the recursion
+            }
+
+            template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
+            inline B horner(const B& self) noexcept
+            {
+                return fma(self, horner<B, c1, args...>(self), coef<B, c0>()); // c0 + self * (c1 + self * (...)): lowest-degree coefficient first
+            }
+
+            /* origin: boost/simdfunction/horn1.hpp*/
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class B>
+            inline B horner1(const B&) noexcept
+            {
+                return B(1.); // no coefficients: the result is 1
+            }
+
+            template <class B, uint64_t c0>
+            inline B horner1(const B& x) noexcept
+            {
+                return x + detail::coef<B, c0>(); // last coefficient: x + c0, i.e. an implicit coefficient of 1 on the next power
+            }
+
+            template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
+            inline B horner1(const B& x) noexcept
+            {
+                return fma(x, horner1<B, c1, args...>(x), detail::coef<B, c0>()); // c0 + x * (c1 + x * (...)): lowest-degree coefficient first
+            }
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_logical.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_logical.hpp
new file mode 100644
index 0000000000..dd446e83dd
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_logical.hpp
@@ -0,0 +1,163 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_LOGICAL_HPP
+#define XSIMD_GENERIC_LOGICAL_HPP
+
+#include "./xsimd_generic_details.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+
+        using namespace types;
+
+        // from_mask: expands a bitmask into a batch_bool, bit i of mask driving lane i
+        template <class A, class T>
+        inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<generic>) noexcept
+        {
+            using bool_batch = batch_bool<T, A>;
+            alignas(A::alignment()) bool lanes[bool_batch::size];
+            // Slow scalar path that should never be called; a stopgap until arm support is added.
+            for (size_t lane = 0; lane < bool_batch::size; ++lane)
+                lanes[lane] = ((mask >> lane) & 1u) != 0;
+            return bool_batch::load_aligned(lanes);
+        }
+
+        // ge: self >= other, expressed as other <= self
+        template <class A, class T>
+        inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            return other <= self;
+        }
+
+        // gt: self > other, expressed as other < self
+        template <class A, class T>
+        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            return other < self;
+        }
+
+        // is_even: true where half of self passes is_flint
+        template <class A, class T>
+        inline batch_bool<T, A> is_even(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return is_flint(self * T(0.5)); // NOTE(review): T(0.5) is 0 for integral T - presumably meant for floating-point batches only, confirm
+        }
+
+        // is_flint: true where self equals trunc(self); lanes where self - self is NaN (inf, NaN) give false
+        template <class A, class T>
+        inline batch_bool<T, A> is_flint(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            auto frac = select(isnan(self - self), constants::nan<batch<T, A>>(), self - trunc(self)); // NaN marks non-finite lanes so the comparison below fails for them
+            return frac == T(0.);
+        }
+
+        // is_odd: true where self - 1 is even
+        template <class A, class T>
+        inline batch_bool<T, A> is_odd(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return is_even(self - T(1.));
+        }
+
+        // isinf: always false for integral T; for float and double, true where abs(self) equals infinity
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> isinf(batch<T, A> const&, requires_arch<generic>) noexcept
+        {
+            return batch_bool<T, A>(false);
+        }
+        template <class A>
+        inline batch_bool<float, A> isinf(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            return abs(self) == std::numeric_limits<float>::infinity();
+        }
+        template <class A>
+        inline batch_bool<double, A> isinf(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            return abs(self) == std::numeric_limits<double>::infinity();
+        }
+
+        // isfinite: always true for integral T; for float and double, true where self - self compares equal to 0
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> isfinite(batch<T, A> const&, requires_arch<generic>) noexcept
+        {
+            return batch_bool<T, A>(true);
+        }
+        template <class A>
+        inline batch_bool<float, A> isfinite(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            return (self - self) == 0.f; // inf - inf and NaN - NaN are NaN, which never compares equal to 0
+        }
+        template <class A>
+        inline batch_bool<double, A> isfinite(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            return (self - self) == 0.; // same subtraction trick as the float overload
+        }
+
+        // isnan: integral overload only, always false
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> isnan(batch<T, A> const&, requires_arch<generic>) noexcept
+        {
+            return batch_bool<T, A>(false);
+        }
+
+        // le: integral-only fallback built as (self < other) || (self == other)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            return (self < other) || (self == other);
+        }
+
+        // neq: negation of the equality comparison
+        template <class A, class T>
+        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            return !(other == self);
+        }
+
+        // logical_and: lane-wise x && y through the scalar detail::apply fallback; each result is converted back to T
+        template <class A, class T>
+        inline batch<T, A> logical_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            return detail::apply([](T x, T y) noexcept
+                                 { return x && y; },
+                                 self, other);
+        }
+
+        // logical_or: lane-wise x || y through the scalar detail::apply fallback; each result is converted back to T
+        template <class A, class T>
+        inline batch<T, A> logical_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            return detail::apply([](T x, T y) noexcept
+                                 { return x || y; },
+                                 self, other);
+        }
+
+        // mask: packs a batch_bool into a bitmask, lane i driving bit i of the result
+        template <class A, class T>
+        inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
+            self.store_aligned(buffer);
+            // This is inefficient but should never be called. It's just a
+            // temporary implementation until arm support is added.
+            uint64_t res = 0;
+            for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
+                if (buffer[i])
+                    res |= 1ull << i; // 64-bit literal, as in from_mask: unsigned long can be only 32 bits wide
+            return res;
+        }
+    }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
new file mode 100644
index 0000000000..ea2f1567e4
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
@@ -0,0 +1,2418 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_MATH_HPP
+#define XSIMD_GENERIC_MATH_HPP
+
+#include "../xsimd_scalar.hpp"
+#include "./xsimd_generic_details.hpp"
+#include "./xsimd_generic_trigo.hpp"
+
+#include <type_traits>
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+
+        using namespace types;
+        // abs: unsigned batches are returned as is; otherwise (self ^ sign) - sign with sign = bitofsign(self)
+        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            if (std::is_unsigned<T>::value)
+                return self;
+            else
+            {
+                auto sign = bitofsign(self);
+                auto inv = self ^ sign;
+                return inv - sign;
+            }
+        }
+
+        template <class A, class T>
+        inline batch<T, A> abs(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+        {
+            return hypot(z.real(), z.imag()); // complex magnitude as hypot(real part, imaginary part)
+        }
+
+        // batch_cast: source and destination element types are identical, so self is returned unchanged
+        template <class A, class T>
+        inline batch<T, A> batch_cast(batch<T, A> const& self, batch<T, A> const&, requires_arch<generic>) noexcept
+        {
+            return self;
+        }
+
+        namespace detail
+        {
+            template <class A, class T_out, class T_in>
+            inline batch<T_out, A> batch_cast(batch<T_in, A> const& self, batch<T_out, A> const& out, requires_arch<generic>, with_fast_conversion) noexcept
+            {
+                return fast_cast(self, out, A {}); // forwards to the fast_cast overload found for architecture A
+            }
+            template <class A, class T_out, class T_in>
+            inline batch<T_out, A> batch_cast(batch<T_in, A> const& self, batch<T_out, A> const&, requires_arch<generic>, with_slow_conversion) noexcept
+            {
+                static_assert(!std::is_same<T_in, T_out>::value, "there should be no conversion for this type combination");
+                using batch_type_in = batch<T_in, A>;
+                using batch_type_out = batch<T_out, A>;
+                static_assert(batch_type_in::size == batch_type_out::size, "compatible sizes");
+                alignas(A::alignment()) T_in buffer_in[batch_type_in::size];
+                alignas(A::alignment()) T_out buffer_out[batch_type_out::size];
+                self.store_aligned(&buffer_in[0]);
+                std::copy(std::begin(buffer_in), std::end(buffer_in), std::begin(buffer_out)); // element-wise scalar conversion from T_in to T_out
+                return batch_type_out::load_aligned(buffer_out);
+            }
+
+        }
+
+        template <class A, class T_out, class T_in>
+        inline batch<T_out, A> batch_cast(batch<T_in, A> const& self, batch<T_out, A> const& out, requires_arch<generic>) noexcept
+        {
+            return detail::batch_cast(self, out, A {}, detail::conversion_type<A, T_in, T_out> {}); // the tag selects the with_fast_conversion or with_slow_conversion overload
+        }
+
+        // bitofsign: 0 for unsigned T, self shifted right by (bit width - 1) for other integers, self & minuszero for float and double
+        template <class A, class T>
+        inline batch<T, A> bitofsign(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            static_assert(std::is_integral<T>::value, "int type implementation");
+            if (std::is_unsigned<T>::value)
+                return batch<T, A>(0);
+            else
+                return self >> (T)(8 * sizeof(T) - 1);
+        }
+
+        template <class A>
+        inline batch<float, A> bitofsign(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            return self & constants::minuszero<batch<float, A>>();
+        }
+        template <class A>
+        inline batch<double, A> bitofsign(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            return self & constants::minuszero<batch<double, A>>();
+        }
+
+        // bitwise_cast: source and destination element types are identical, so self is returned unchanged
+        template <class A, class T>
+        inline batch<T, A> bitwise_cast(batch<T, A> const& self, batch<T, A> const&, requires_arch<generic>) noexcept
+        {
+            return self;
+        }
+
+        // cbrt
+        /* origin: boost/simd/arch/common/simd/function/cbrt.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A>
+        inline batch<float, A> cbrt(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<float, A>;
+            batch_type z = abs(self); // work on the magnitude; the sign bit is OR-ed back in at the end
+#ifndef XSIMD_NO_DENORMALS
+            auto denormal = z < constants::smallestposval<batch_type>();
+            z = select(denormal, z * constants::twotonmb<batch_type>(), z); // scale lanes below smallestposval by twotonmb
+            batch_type f = select(denormal, constants::twotonmbo3<batch_type>(), batch_type(1.)); // factor multiplied into the final result for those lanes
+#endif
+            const batch_type CBRT2(bit_cast<float>(0x3fa14518));
+            const batch_type CBRT4(bit_cast<float>(0x3fcb2ff5));
+            const batch_type CBRT2I(bit_cast<float>(0x3f4b2ff5));
+            const batch_type CBRT4I(bit_cast<float>(0x3f214518));
+            using i_type = as_integer_t<batch_type>;
+            i_type e;
+            batch_type x = frexp(z, e); // e is passed to frexp uninitialised and read afterwards as the exponent
+            x = detail::horner<batch_type,
+                               0x3ece0609,
+                               0x3f91eb77,
+                               0xbf745265,
+                               0x3f0bf0fe,
+                               0xbe09e49a>(x);
+            auto flag = e >= i_type(0);
+            i_type e1 = abs(e);
+            i_type rem = e1;
+            e1 /= i_type(3); // e1 = |e| / 3
+            rem -= e1 * i_type(3); // rem = |e| - 3 * e1
+            e = e1 * sign(e); // quotient with the sign of e restored
+            const batch_type cbrt2 = select(batch_bool_cast<float>(flag), CBRT2, CBRT2I); // CBRT2/CBRT4 for e >= 0, the *I variants otherwise
+            const batch_type cbrt4 = select(batch_bool_cast<float>(flag), CBRT4, CBRT4I);
+            batch_type fact = select(batch_bool_cast<float>(rem == i_type(1)), cbrt2, batch_type(1.));
+            fact = select(batch_bool_cast<float>(rem == i_type(2)), cbrt4, fact);
+            x = ldexp(x * fact, e);
+            x -= (x - z / (x * x)) * batch_type(1.f / 3.f); // one Newton iteration on x^3 = z
+#ifndef XSIMD_NO_DENORMALS
+            x = (x | bitofsign(self)) * f;
+#else
+            x = x | bitofsign(self);
+#endif
+#ifndef XSIMD_NO_INFINITIES
+            return select(self == batch_type(0.) || isinf(self), self, x); // zero and infinite lanes return self unchanged
+#else
+            return select(self == batch_type(0.), self, x); // zero lanes return self unchanged
+#endif
+        }
+
+        template <class A>
+        inline batch<double, A> cbrt(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<double, A>;
+            batch_type z = abs(self); // work on the magnitude; the sign bit is OR-ed back in at the end
+#ifndef XSIMD_NO_DENORMALS
+            auto denormal = z < constants::smallestposval<batch_type>();
+            z = select(denormal, z * constants::twotonmb<batch_type>(), z); // scale lanes below smallestposval by twotonmb
+            batch_type f = select(denormal, constants::twotonmbo3<batch_type>(), batch_type(1.)); // factor multiplied into the final result for those lanes
+#endif
+            const batch_type CBRT2(bit_cast<double>(int64_t(0x3ff428a2f98d728b)));
+            const batch_type CBRT4(bit_cast<double>(int64_t(0x3ff965fea53d6e3d)));
+            const batch_type CBRT2I(bit_cast<double>(int64_t(0x3fe965fea53d6e3d)));
+            const batch_type CBRT4I(bit_cast<double>(int64_t(0x3fe428a2f98d728b)));
+            using i_type = as_integer_t<batch_type>;
+            i_type e;
+            batch_type x = frexp(z, e); // e is passed to frexp uninitialised and read afterwards as the exponent
+            x = detail::horner<batch_type,
+                               0x3fd9c0c12122a4feull,
+                               0x3ff23d6ee505873aull,
+                               0xbfee8a4ca3ba37b8ull,
+                               0x3fe17e1fc7e59d58ull,
+                               0xbfc13c93386fdff6ull>(x);
+            auto flag = e >= typename i_type::value_type(0);
+            i_type e1 = abs(e);
+            i_type rem = e1;
+            e1 /= i_type(3); // e1 = |e| / 3
+            rem -= e1 * i_type(3); // rem = |e| - 3 * e1
+            e = e1 * sign(e); // quotient with the sign of e restored
+            const batch_type cbrt2 = select(batch_bool_cast<double>(flag), CBRT2, CBRT2I); // CBRT2/CBRT4 for e >= 0, the *I variants otherwise
+            const batch_type cbrt4 = select(batch_bool_cast<double>(flag), CBRT4, CBRT4I);
+            batch_type fact = select(batch_bool_cast<double>(rem == i_type(1)), cbrt2, batch_type(1.));
+            fact = select(batch_bool_cast<double>(rem == i_type(2)), cbrt4, fact);
+            x = ldexp(x * fact, e);
+            x -= (x - z / (x * x)) * batch_type(1. / 3.); // first Newton iteration on x^3 = z
+            x -= (x - z / (x * x)) * batch_type(1. / 3.); // second iteration (the float overload performs only one)
+#ifndef XSIMD_NO_DENORMALS
+            x = (x | bitofsign(self)) * f;
+#else
+            x = x | bitofsign(self);
+#endif
+#ifndef XSIMD_NO_INFINITIES
+            return select(self == batch_type(0.) || isinf(self), self, x); // zero and infinite lanes return self unchanged
+#else
+            return select(self == batch_type(0.), self, x); // zero lanes return self unchanged
+#endif
+        }
+
+        // clip: bounds self between lo and hi, computed as min(hi, max(self, lo))
+        template <class A, class T>
+        inline batch<T, A> clip(batch<T, A> const& self, batch<T, A> const& lo, batch<T, A> const& hi, requires_arch<generic>) noexcept
+        {
+            return min(hi, max(self, lo));
+        }
+
+        // copysign: floating-point only; abs(self) OR-ed with the sign bit of other
+        template <class A, class T, class _ = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
+        inline batch<T, A> copysign(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            return abs(self) | bitofsign(other);
+        }
+
+        // erf
+
+        namespace detail
+        {
+            /* origin: boost/simd/arch/common/detail/generic/erf_kernel.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class B>
+            struct erf_kernel;
+
+            template <class A>
+            struct erf_kernel<batch<float, A>>
+            {
+                using batch_type = batch<float, A>;
+                // computes erf(a0)/a0
+                // x is sqr(a0) and 0 <= abs(a0) <= 2/3
+                static inline batch_type erf1(const batch_type& x) noexcept
+                {
+                    return detail::horner<batch_type,
+                                          0x3f906eba, //   1.128379154774254e+00
+                                          0xbec0937e, //  -3.761252839094832e-01
+                                          0x3de70f22, //   1.128218315189123e-01
+                                          0xbcdb61f4, //  -2.678010670585737e-02
+                                          0x3ba4468d, //   5.013293006147870e-03
+                                          0xba1fc83b //  -6.095205117313012e-04
+                                          >(x);
+                }
+
+                // computes erfc(x)*exp(sqr(x))
+                // x >=  2/3
+                static inline batch_type erfc2(const batch_type& x) noexcept
+                {
+                    return detail::horner<batch_type,
+                                          0x3f0a0e8b, //   5.392844046572836e-01
+                                          0xbf918a62, //  -1.137035586823118e+00
+                                          0x3e243828, //   1.603704761054187e-01
+                                          0x3ec4ca6e, //   3.843569094305250e-01
+                                          0x3e1175c7, //   1.420508523645926e-01
+                                          0x3e2006f0, //   1.562764709849380e-01
+                                          0xbfaea865, //  -1.364514006347145e+00
+                                          0x4050b063, //   3.260765682222576e+00
+                                          0xc0cd1a85, //  -6.409487379234005e+00
+                                          0x40d67e3b, //   6.702908785399893e+00
+                                          0xc0283611 //  -2.628299919293280e+00
+                                          >(x);
+                }
+
+                static inline batch_type erfc3(const batch_type& x) noexcept // NOTE(review): undocumented upstream; evaluates (1 - x) * P(x) - confirm the expected argument and range
+                {
+                    return (batch_type(1.) - x) * detail::horner<batch_type,
+                                                                 0x3f7ffffe, //   9.9999988e-01
+                                                                 0xbe036d7e, //  -1.2834737e-01
+                                                                 0xbfa11698, //  -1.2585020e+00
+                                                                 0xbffc9284, //  -1.9732213e+00
+                                                                 0xc016c985, //  -2.3560498e+00
+                                                                 0x3f2cff3b, //   6.7576951e-01
+                                                                 0xc010d956, //  -2.2632651e+00
+                                                                 0x401b5680, //   2.4271545e+00
+                                                                 0x41aa8e55 //   2.1319498e+01
+                                                                 >(x);
+                }
+            };
+
+            // Double-precision approximation kernels shared by erf and erfc.
+            // Each kernel evaluates polynomials through detail::horner; the coefficients are
+            // IEEE-754 double-precision bit patterns with their decimal value in the
+            // trailing comment.
+            template <class A>
+            struct erf_kernel<batch<double, A>>
+            {
+                using batch_type = batch<double, A>;
+
+                // Rational approximation of erf(a0) / a0.
+                // x must be a0 squared, with 0 <= abs(a0) <= 0.65.
+                static inline batch_type erf1(const batch_type& x) noexcept
+                {
+                    const batch_type num = detail::horner<batch_type,
+                                                          0x3ff20dd750429b61ull, // 1.12837916709551
+                                                          0x3fc16500f106c0a5ull, // 0.135894887627278
+                                                          0x3fa4a59a4f02579cull, // 4.03259488531795E-02
+                                                          0x3f53b7664358865aull, // 1.20339380863079E-03
+                                                          0x3f110512d5b20332ull // 6.49254556481904E-05
+                                                          >(x);
+                    const batch_type den = detail::horner<batch_type,
+                                                          0x3ff0000000000000ull, // 1
+                                                          0x3fdd0a84eb1ca867ull, // 0.453767041780003
+                                                          0x3fb64536ca92ea2full, // 8.69936222615386E-02
+                                                          0x3f8166f75999dbd1ull, // 8.49717371168693E-03
+                                                          0x3f37ea4332348252ull // 3.64915280629351E-04
+                                                          >(x);
+                    return num / den;
+                }
+
+                // Rational approximation of erfc(x) * exp(x * x),
+                // intended for 0.65 <= abs(x) <= 2.2.
+                static inline batch_type erfc2(const batch_type& x) noexcept
+                {
+                    const batch_type num = detail::horner<batch_type,
+                                                          0x3feffffffbbb552bull, // 0.999999992049799
+                                                          0x3ff54dfe9b258a60ull, // 1.33154163936765
+                                                          0x3fec1986509e687bull, // 0.878115804155882
+                                                          0x3fd53dd7a67c7e9full, // 0.331899559578213
+                                                          0x3fb2488a6b5cb5e5ull, // 7.14193832506776E-02
+                                                          0x3f7cf4cfe0aacbb4ull, // 7.06940843763253E-03
+                                                          0x0ull // 0
+                                                          >(x);
+                    const batch_type den = detail::horner<batch_type,
+                                                          0x3ff0000000000000ull, // 1
+                                                          0x4003adeae79b9708ull, // 2.45992070144246
+                                                          0x40053b1052dca8bdull, // 2.65383972869776
+                                                          0x3ff9e677c2777c3cull, // 1.61876655543871
+                                                          0x3fe307622fcff772ull, // 0.594651311286482
+                                                          0x3fc033c113a7deeeull, // 0.126579413030178
+                                                          0x3f89a996639b0d00ull // 1.25304936549413E-02
+                                                          >(x);
+                    return num / den;
+                }
+
+                // Rational approximation of erfc(x) * exp(x * x),
+                // intended for 2.2 <= abs(x) <= 6.
+                static inline batch_type erfc3(const batch_type& x) noexcept
+                {
+                    const batch_type num = detail::horner<batch_type,
+                                                          0x3fefff5a9e697ae2ull, // 0.99992114009714
+                                                          0x3ff9fa202deb88e5ull, // 1.62356584489367
+                                                          0x3ff44744306832aeull, // 1.26739901455873
+                                                          0x3fe29be1cff90d94ull, // 0.581528574177741
+                                                          0x3fc42210f88b9d43ull, // 0.157289620742839
+                                                          0x3f971d0907ea7a92ull, // 2.25716982919218E-02
+                                                          0x0ll // 0
+                                                          >(x);
+                    const batch_type den = detail::horner<batch_type,
+                                                          0x3ff0000000000000ull, // 1
+                                                          0x400602f24bf3fdb6ull, // 2.75143870676376
+                                                          0x400afd487397568full, // 3.37367334657285
+                                                          0x400315ffdfd5ce91ull, // 2.38574194785344
+                                                          0x3ff0cfd4cb6cde9full, // 1.05074004614827
+                                                          0x3fd1d7ab774bb837ull, // 0.278788439273629
+                                                          0x3fa47bd61bbb3843ull // 4.00072964526861E-02
+                                                          >(x);
+                    return num / den;
+                }
+
+                // Polynomial approximation of erfc(rx) * exp(rx * rx) for rx >= 6,
+                // where the argument x passed in is 1 / rx.
+                // NOTE(review): the erf/erfc implementations that follow this struct do not
+                // call this kernel; confirm whether it is used elsewhere.
+                static inline batch_type erfc4(const batch_type& x) noexcept
+                {
+                    const batch_type poly = detail::horner<batch_type,
+                                                           0xbc7e4ad1ec7d0000ll, // -2.627435221016534e-17
+                                                           0x3fe20dd750429a16ll, // 5.641895835477182e-01
+                                                           0x3db60000e984b501ll, // 2.000889609806154e-11
+                                                           0xbfd20dd753ae5dfdll, // -2.820947949598745e-01
+                                                           0x3e907e71e046a820ll, // 2.457786367990903e-07
+                                                           0x3fdb1494cac06d39ll, // 4.231311779019112e-01
+                                                           0x3f34a451701654f1ll, // 3.149699042180451e-04
+                                                           0xbff105e6b8ef1a63ll, // -1.063940737150596e+00
+                                                           0x3fb505a857e9ccc8ll, // 8.211757799454056e-02
+                                                           0x40074fbabc514212ll, // 2.913930388669777e+00
+                                                           0x4015ac7631f7ac4fll, // 5.418419628850713e+00
+                                                           0xc0457e03041e9d8bll, // -4.298446704382794e+01
+                                                           0x4055803d26c4ec4fll, // 8.600373238783617e+01
+                                                           0xc0505fce04ec4ec5ll // -6.549694941594051e+01
+                                                           >(x);
+                    return poly;
+                }
+            };
+        }
+        /* origin: boost/simd/arch/common/simd/function/erf.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+
+        // Generic single-precision erf.
+        // Lanes with |self| < 2/3 use self * erf1(self^2); all other lanes use
+        // 1 - exp(-x^2) * erfc2(x / (1 + x) - 0.4) on x = |self|, negated for negative
+        // inputs. Unless XSIMD_NO_INFINITIES is defined, infinite inputs yield sign(self).
+        template <class A>
+        inline batch<float, A> erf(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<float, A>;
+            using kernel = detail::erf_kernel<batch_type>;
+            const batch_type abs_x = abs(self);
+            const auto is_small = abs_x < batch_type(2.f / 3.f);
+            batch_type small_res(0.);
+            if (any(is_small))
+            {
+                small_res = self * kernel::erf1(abs_x * abs_x);
+                // Fast exit: every lane is in the small-argument range.
+                if (all(is_small))
+                {
+                    return small_res;
+                }
+            }
+            batch_type z = abs_x / (batch_type(1.) + abs_x);
+            z -= batch_type(0.4f);
+            const batch_type tail = batch_type(1.) - exp(-abs_x * abs_x) * kernel::erfc2(z);
+            const batch_type signed_tail = select(self < batch_type(0.), -tail, tail);
+            batch_type result = select(is_small, small_res, signed_tail);
+#ifndef XSIMD_NO_INFINITIES
+            result = select(xsimd::isinf(self), sign(self), result);
+#endif
+            return result;
+        }
+
+        // Generic double-precision erf, evaluated piecewise on x = |self|:
+        //   x < 0.65          -> self * erf1(x^2)
+        //   0.65 <= x < 2.2   -> +/- (1 - exp(-x^2) * erfc2(x))
+        //   otherwise         -> +/- (1 - exp(-x^2) * erfc3(x))
+        // Each range is only evaluated when at least one lane needs it, and the function
+        // returns early once every lane has been handled. Unless XSIMD_NO_INFINITIES is
+        // defined, infinite inputs yield sign(self).
+        template <class A>
+        inline batch<double, A> erf(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<double, A>;
+            using kernel = detail::erf_kernel<batch_type>;
+            const batch_type abs_x = abs(self);
+            const batch_type sq = abs_x * abs_x;
+            const auto in_low = abs_x < batch_type(0.65);
+            batch_type acc(0.);
+            if (any(in_low))
+            {
+                acc = self * kernel::erf1(sq);
+                if (all(in_low))
+                {
+                    return acc;
+                }
+            }
+            const auto below_high = abs_x < batch_type(2.2);
+            const auto in_mid = below_high && !in_low;
+            const batch_type gauss = exp(-sq);
+            if (any(in_mid))
+            {
+                const batch_type mid = batch_type(1.) - gauss * kernel::erfc2(abs_x);
+                const batch_type signed_mid = select(self < batch_type(0.), -mid, mid);
+                acc = select(in_low, acc, signed_mid);
+                if (all(in_low || in_mid))
+                {
+                    return acc;
+                }
+            }
+            batch_type high = batch_type(1.) - gauss * kernel::erfc3(abs_x);
+            high = select(self < batch_type(0.), -high, high);
+#ifndef XSIMD_NO_INFINITIES
+            high = select(xsimd::isinf(self), sign(self), high);
+#endif
+            return select(below_high, acc, high);
+        }
+
+        // erfc
+        // Generic single-precision erfc, evaluated on x = |self| and reflected for
+        // negative inputs via erfc(-x) = 2 - erfc(x):
+        //   3 * x < 2   -> erfc3(x / (1 + x))
+        //   otherwise   -> exp(-x^2) * erfc2(x / (1 + x) - 0.4)
+        // The two ranges are blended per lane. Selecting a single approximation for the
+        // whole batch would apply the small-argument kernel to lanes outside its range
+        // as soon as one lane of the batch is small.
+        // Unless XSIMD_NO_INFINITIES is defined, lanes with infinite |self| yield 0
+        // before the reflection (so +inf -> 0 and -inf -> 2).
+        template <class A>
+        inline batch<float, A> erfc(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<float, A>;
+            batch_type x = abs(self);
+            auto test0 = self < batch_type(0.);
+            auto test1 = 3.f * x < 2.f;
+            batch_type r1(0.);
+            batch_type z = x / (batch_type(1.) + x);
+            if (any(test1))
+            {
+                r1 = detail::erf_kernel<batch_type>::erfc3(z);
+                // Fast exit: every lane is in the small-argument range (none is infinite).
+                if (all(test1))
+                    return select(test0, batch_type(2.) - r1, r1);
+            }
+            z -= batch_type(0.4f);
+            batch_type r2 = exp(-x * x) * detail::erf_kernel<batch_type>::erfc2(z);
+            // Per-lane choice between the two approximations.
+            r1 = select(test1, r1, r2);
+#ifndef XSIMD_NO_INFINITIES
+            r1 = select(x == constants::infinity<batch_type>(), batch_type(0.), r1);
+#endif
+            return select(test0, batch_type(2.) - r1, r1);
+        }
+
+        // Generic double-precision erfc, evaluated piecewise on x = |self|:
+        //   x < 0.65          -> 1 - x * erf1(x^2)
+        //   0.65 <= x < 2.2   -> exp(-x^2) * erfc2(x)
+        //   otherwise         -> exp(-x^2) * erfc3(x)
+        // Negative inputs are reflected at every exit with erfc(-x) = 2 - erfc(x).
+        // Unless XSIMD_NO_INFINITIES is defined, lanes with infinite |self| yield 0
+        // before the reflection.
+        template <class A>
+        inline batch<double, A> erfc(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<double, A>;
+            using kernel = detail::erf_kernel<batch_type>;
+            const batch_type abs_x = abs(self);
+            const batch_type sq = abs_x * abs_x;
+            const auto is_neg = self < batch_type(0.);
+            const auto in_low = abs_x < batch_type(0.65);
+            batch_type acc(0.);
+            if (any(in_low))
+            {
+                acc = batch_type(1.) - abs_x * kernel::erf1(sq);
+                if (all(in_low))
+                {
+                    return select(is_neg, batch_type(2.) - acc, acc);
+                }
+            }
+            const auto below_high = abs_x < batch_type(2.2);
+            const auto in_mid = below_high && !in_low;
+            const batch_type gauss = exp(-sq);
+            if (any(in_mid))
+            {
+                const batch_type mid = gauss * kernel::erfc2(abs_x);
+                acc = select(in_low, acc, mid);
+                if (all(in_low || in_mid))
+                {
+                    return select(is_neg, batch_type(2.) - acc, acc);
+                }
+            }
+            const batch_type high = gauss * kernel::erfc3(abs_x);
+            acc = select(below_high, acc, high);
+#ifndef XSIMD_NO_INFINITIES
+            acc = select(abs_x == constants::infinity<batch_type>(), batch_type(0.), acc);
+#endif
+            return select(is_neg, batch_type(2.) - acc, acc);
+        }
+
+        // estrin
+        namespace detail
+        {
+
+            // Polynomial evaluator that combines coefficients pairwise.
+            //
+            // Calling the object with coefficients c0, c1, c2, ... first folds adjacent
+            // pairs into c0 + x * c1, c2 + x * c3, ... (a trailing unpaired coefficient is
+            // kept as is), collects these partial results in a std::tuple, and then
+            // evaluates them again with a new evaluator whose variable is x * x. The
+            // recursion stops when one or two values remain.
+            template <class B>
+            struct estrin
+            {
+                // Value at which the current level of the polynomial is evaluated.
+                B x;
+
+                // Entry point: coefs are passed lowest degree first.
+                template <typename... Ts>
+                inline B operator()(const Ts&... coefs) noexcept
+                {
+                    return eval(coefs...);
+                }
+
+            private:
+                // Terminal case: a single remaining coefficient.
+                inline B eval(const B& c0) noexcept
+                {
+                    return c0;
+                }
+
+                // Terminal case / pair reduction: c0 + x * c1.
+                inline B eval(const B& c0, const B& c1) noexcept
+                {
+                    return fma(x, c1, c0);
+                }
+
+                // Unpacks the tuple of partial results and evaluates them one level up,
+                // i.e. with x * x as the variable.
+                template <size_t... Is, class Tuple>
+                inline B eval(::xsimd::detail::index_sequence<Is...>, const Tuple& tuple) noexcept
+                {
+                    return estrin { x * x }(std::get<Is>(tuple)...);
+                }
+
+                // All coefficients of this level have been paired: recurse on the tuple.
+                template <class... Args>
+                inline B eval(const std::tuple<Args...>& tuple) noexcept
+                {
+                    return eval(::xsimd::detail::make_index_sequence<sizeof...(Args)>(), tuple);
+                }
+
+                // One unpaired coefficient left: append it unchanged, then recurse.
+                template <class... Args>
+                inline B eval(const std::tuple<Args...>& tuple, const B& c0) noexcept
+                {
+                    return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0))));
+                }
+
+                // Exactly one pair left: append its reduction, then recurse.
+                template <class... Args>
+                inline B eval(const std::tuple<Args...>& tuple, const B& c0, const B& c1) noexcept
+                {
+                    return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))));
+                }
+
+                // More than two coefficients left: reduce the next pair and continue.
+                template <class... Args, class... Ts>
+                inline B eval(const std::tuple<Args...>& tuple, const B& c0, const B& c1, const Ts&... coefs) noexcept
+                {
+                    return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))), coefs...);
+                }
+
+                // First step for three or more coefficients: start the tuple of partial results.
+                template <class... Ts>
+                inline B eval(const B& c0, const B& c1, const Ts&... coefs) noexcept
+                {
+                    return eval(std::make_tuple(eval(c0, c1)), coefs...);
+                }
+            };
+        }
+
+        // Evaluates at `self` the polynomial whose coefficients are given by the Coefs
+        // bit patterns (each converted with detail::coef), using detail::estrin.
+        template <class T, class A, uint64_t... Coefs>
+        inline batch<T, A> estrin(const batch<T, A>& self) noexcept
+        {
+            using batch_type = batch<T, A>;
+            detail::estrin<batch_type> evaluator { self };
+            return evaluator(detail::coef<batch_type, Coefs>()...);
+        }
+
+        // exp
+        /* origin: boost/simd/arch/common/detail/simd/expo_base.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        namespace detail
+        {
+            // Selects which exponential the shared detail::exp implementation computes.
+            // The tag picks the exp_reduction / exp_reduction_base specializations below.
+            enum exp_reduction_tag
+            {
+                exp_tag, // used by exp
+                exp2_tag, // used by exp2
+                exp10_tag // used by exp10
+            };
+
+            template <class B, exp_reduction_tag Tag>
+            struct exp_reduction_base;
+
+            // Input thresholds for exp_tag: detail::exp returns infinity for inputs
+            // >= maxlog() and 0 for inputs <= minlog().
+            template <class B>
+            struct exp_reduction_base<B, exp_tag>
+            {
+                // Upper threshold, taken from constants::maxlog.
+                static constexpr B maxlog() noexcept
+                {
+                    return constants::maxlog<B>();
+                }
+
+                // Lower threshold, taken from constants::minlog.
+                static constexpr B minlog() noexcept
+                {
+                    return constants::minlog<B>();
+                }
+            };
+
+            // Input thresholds for exp10_tag: detail::exp returns infinity for inputs
+            // >= maxlog() and 0 for inputs <= minlog().
+            template <class B>
+            struct exp_reduction_base<B, exp10_tag>
+            {
+                // Upper threshold, taken from constants::maxlog10.
+                static constexpr B maxlog() noexcept
+                {
+                    return constants::maxlog10<B>();
+                }
+
+                // Lower threshold, taken from constants::minlog10.
+                static constexpr B minlog() noexcept
+                {
+                    return constants::minlog10<B>();
+                }
+            };
+
+            // Input thresholds for exp2_tag: detail::exp returns infinity for inputs
+            // >= maxlog() and 0 for inputs <= minlog().
+            template <class B>
+            struct exp_reduction_base<B, exp2_tag>
+            {
+                // Upper threshold, taken from constants::maxlog2.
+                static constexpr B maxlog() noexcept
+                {
+                    return constants::maxlog2<B>();
+                }
+
+                // Lower threshold, taken from constants::minlog2.
+                static constexpr B minlog() noexcept
+                {
+                    return constants::minlog2<B>();
+                }
+            };
+
+            template <class T, class A, exp_reduction_tag Tag>
+            struct exp_reduction;
+
+            // Single-precision building blocks for exp.
+            template <class A>
+            struct exp_reduction<float, A, exp_tag> : exp_reduction_base<batch<float, A>, exp_tag>
+            {
+                using batch_type = batch<float, A>;
+
+                // Approximates exp on the reduced argument as 1 + x + x^2 * P(x), with P
+                // evaluated by detail::horner from the coefficients below.
+                static inline batch_type approx(const batch_type& x) noexcept
+                {
+                    const batch_type poly = detail::horner<batch_type,
+                                                           0x3f000000, //  5.0000000e-01
+                                                           0x3e2aa9a5, //  1.6666277e-01
+                                                           0x3d2aa957, //  4.1665401e-02
+                                                           0x3c098d8b, //  8.3955629e-03
+                                                           0x3ab778cf //  1.3997796e-03
+                                                           >(x);
+                    batch_type res = fma(poly, x * x, x);
+                    return ++res;
+                }
+
+                // Argument reduction: returns k = nearbyint(a * invlog_2) and stores in x the
+                // remainder obtained by removing k * log_2hi and then k * log_2lo from a
+                // (two fnma steps).
+                static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept
+                {
+                    const batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
+                    const batch_type coarse = fnma(k, constants::log_2hi<batch_type>(), a);
+                    x = fnma(k, constants::log_2lo<batch_type>(), coarse);
+                    return k;
+                }
+            };
+
+            // Single-precision building blocks for exp10.
+            template <class A>
+            struct exp_reduction<float, A, exp10_tag> : exp_reduction_base<batch<float, A>, exp10_tag>
+            {
+                using batch_type = batch<float, A>;
+
+                // Approximates 10^x on the reduced argument as 1 + x * P(x), with P
+                // evaluated by detail::horner from the coefficients below.
+                static inline batch_type approx(const batch_type& x) noexcept
+                {
+                    const batch_type poly = detail::horner<batch_type,
+                                                           0x40135d8e, //    2.3025851e+00
+                                                           0x4029a926, //    2.6509490e+00
+                                                           0x400237da, //    2.0346589e+00
+                                                           0x3f95eb4c, //    1.1712432e+00
+                                                           0x3f0aacef, //    5.4170126e-01
+                                                           0x3e54dff1 //    2.0788552e-01
+                                                           >(x);
+                    batch_type res = poly * x;
+                    return ++res;
+                }
+
+                // Argument reduction: returns k = nearbyint(a * invlog10_2) and stores in x
+                // the remainder obtained by removing k * log10_2hi (fnma) and then
+                // k * log10_2lo (plain multiply and subtract) from a.
+                static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept
+                {
+                    const batch_type k = nearbyint(constants::invlog10_2<batch_type>() * a);
+                    x = fnma(k, constants::log10_2hi<batch_type>(), a);
+                    x -= k * constants::log10_2lo<batch_type>();
+                    return k;
+                }
+            };
+
+            // Single-precision building blocks for exp2.
+            template <class A>
+            struct exp_reduction<float, A, exp2_tag> : exp_reduction_base<batch<float, A>, exp2_tag>
+            {
+                using batch_type = batch<float, A>;
+
+                // Approximates 2^x on the reduced argument as 1 + x * log_2 + x^2 * P(x),
+                // with P evaluated by detail::horner from the coefficients below.
+                static inline batch_type approx(const batch_type& x) noexcept
+                {
+                    const batch_type poly = detail::horner<batch_type,
+                                                           0x3e75fdf1, //    2.4022652e-01
+                                                           0x3d6356eb, //    5.5502813e-02
+                                                           0x3c1d9422, //    9.6178371e-03
+                                                           0x3ab01218, //    1.3433127e-03
+                                                           0x3922c8c4 //    1.5524315e-04
+                                                           >(x);
+                    batch_type res = fma(poly, x * x, x * constants::log_2<batch_type>());
+                    return ++res;
+                }
+
+                // Argument reduction: returns k = nearbyint(a) and stores a - k in x.
+                static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept
+                {
+                    const batch_type k = nearbyint(a);
+                    x = (a - k);
+                    return k;
+                }
+            };
+
+            // Double-precision building blocks for exp.
+            template <class A>
+            struct exp_reduction<double, A, exp_tag> : exp_reduction_base<batch<double, A>, exp_tag>
+            {
+                using batch_type = batch<double, A>;
+
+                // Returns the correction term c = x - t * P(t) with t = x * x, where P is
+                // evaluated by detail::horner from the bit-pattern coefficients below.
+                // The result is consumed by finalize().
+                static inline batch_type approx(const batch_type& x) noexcept
+                {
+                    const batch_type t = x * x;
+                    const batch_type poly = detail::horner<batch_type,
+                                                           0x3fc555555555553eull,
+                                                           0xbf66c16c16bebd93ull,
+                                                           0x3f11566aaf25de2cull,
+                                                           0xbebbbd41c5d26bf1ull,
+                                                           0x3e66376972bea4d0ull>(t);
+                    return fnma(t, poly, x);
+                }
+
+                // Argument reduction: returns k = nearbyint(a * invlog_2). The remainder is
+                // kept in two parts, hi (a with k * log_2hi removed) and lo = k * log_2lo,
+                // and their difference is stored in x.
+                static inline batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) noexcept
+                {
+                    const batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
+                    hi = fnma(k, constants::log_2hi<batch_type>(), a);
+                    lo = k * constants::log_2lo<batch_type>();
+                    x = hi - lo;
+                    return k;
+                }
+
+                // Combines the reduced argument x, the correction c from approx() and the
+                // hi/lo parts from reduce() into 1 - ((lo - x * c / (2 - c)) - hi).
+                static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type& hi, const batch_type& lo) noexcept
+                {
+                    const batch_type ratio = (x * c) / (batch_type(2.) - c);
+                    return batch_type(1.) - ((lo - ratio) - hi);
+                }
+            };
+
+            // Double-precision building blocks for exp10.
+            template <class A>
+            struct exp_reduction<double, A, exp10_tag> : exp_reduction_base<batch<double, A>, exp10_tag>
+            {
+                using batch_type = batch<double, A>;
+
+                // Rational approximation on the reduced argument: with px = x * P(x^2) and
+                // Q evaluated by detail::horner1, returns 1 + 2 * px / (Q(x^2) - px).
+                static inline batch_type approx(const batch_type& x) noexcept
+                {
+                    const batch_type xx = x * x;
+                    const batch_type px = x
+                        * detail::horner<batch_type,
+                                         0x40a2b4798e134a01ull,
+                                         0x40796b7a050349e4ull,
+                                         0x40277d9474c55934ull,
+                                         0x3fa4fd75f3062dd4ull>(xx);
+                    const batch_type qx = detail::horner1<batch_type,
+                                                          0x40a03f37650df6e2ull,
+                                                          0x4093e05eefd67782ull,
+                                                          0x405545fdce51ca08ull>(xx);
+                    const batch_type ratio = px / (qx - px);
+                    batch_type res = ratio + ratio;
+                    return ++res;
+                }
+
+                // Argument reduction: returns k = nearbyint(a * invlog10_2) and stores in x
+                // the remainder obtained by removing k * log10_2hi and then k * log10_2lo
+                // from a (two fnma steps). The hi/lo outputs are not used for exp10.
+                static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept
+                {
+                    const batch_type k = nearbyint(constants::invlog10_2<batch_type>() * a);
+                    const batch_type coarse = fnma(k, constants::log10_2hi<batch_type>(), a);
+                    x = fnma(k, constants::log10_2lo<batch_type>(), coarse);
+                    return k;
+                }
+
+                // approx() already yields the final value; it is passed through unchanged.
+                static inline batch_type finalize(const batch_type&, const batch_type& c, const batch_type&, const batch_type&) noexcept
+                {
+                    return c;
+                }
+            };
+
+            // Double-precision building blocks for exp2.
+            template <class A>
+            struct exp_reduction<double, A, exp2_tag> : exp_reduction_base<batch<double, A>, exp2_tag>
+            {
+                using batch_type = batch<double, A>;
+
+                // Returns the correction term c = x - t * P(t) with t = x * x, where P is
+                // evaluated by detail::horner from the bit-pattern coefficients below
+                // (the same coefficients as the double-precision exp reduction).
+                static inline batch_type approx(const batch_type& x) noexcept
+                {
+                    const batch_type t = x * x;
+                    const batch_type poly = detail::horner<batch_type,
+                                                           0x3fc555555555553eull,
+                                                           0xbf66c16c16bebd93ull,
+                                                           0x3f11566aaf25de2cull,
+                                                           0xbebbbd41c5d26bf1ull,
+                                                           0x3e66376972bea4d0ull>(t);
+                    return fnma(t, poly, x);
+                }
+
+                // Argument reduction: returns k = nearbyint(a) and stores (a - k) * log_2
+                // in x. The hi/lo outputs are not used for exp2.
+                static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept
+                {
+                    const batch_type k = nearbyint(a);
+                    x = (a - k) * constants::log_2<batch_type>();
+                    return k;
+                }
+
+                // Combines the reduced argument x and the correction c from approx() into
+                // 1 + x + x * c / (2 - c).
+                static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type&, const batch_type&) noexcept
+                {
+                    const batch_type ratio = x * c / (batch_type(2.) - c);
+                    return batch_type(1.) + x + ratio;
+                }
+            };
+
+            // Shared single-precision implementation of exp / exp2 / exp10, selected by Tag.
+            // Reduces the argument, approximates on the reduced value, rescales the result
+            // with ldexp by the integer k from the reduction, then returns 0 for inputs
+            // <= minlog() and infinity for inputs >= maxlog().
+            template <exp_reduction_tag Tag, class A>
+            inline batch<float, A> exp(batch<float, A> const& self) noexcept
+            {
+                using batch_type = batch<float, A>;
+                using reducer_t = exp_reduction<float, A, Tag>;
+                batch_type reduced;
+                const batch_type k = reducer_t::reduce(self, reduced);
+                const batch_type approximated = reducer_t::approx(reduced);
+                const batch_type scaled = ldexp(approximated, to_int(k));
+                const batch_type flushed = select(self <= reducer_t::minlog(), batch_type(0.), scaled);
+                return select(self >= reducer_t::maxlog(), constants::infinity<batch_type>(), flushed);
+            }
+
+            // Shared double-precision implementation of exp / exp2 / exp10, selected by Tag.
+            // Reduces the argument, computes the correction term with approx(), turns it
+            // into the final approximation with finalize(), rescales with ldexp by the
+            // integer k from the reduction, then returns 0 for inputs <= minlog() and
+            // infinity for inputs >= maxlog().
+            template <exp_reduction_tag Tag, class A>
+            inline batch<double, A> exp(batch<double, A> const& self) noexcept
+            {
+                using batch_type = batch<double, A>;
+                using reducer_t = exp_reduction<double, A, Tag>;
+                // hi and lo are only written by reducers that need them.
+                batch_type hi, lo, reduced;
+                const batch_type k = reducer_t::reduce(self, hi, lo, reduced);
+                const batch_type correction = reducer_t::approx(reduced);
+                const batch_type approximated = reducer_t::finalize(reduced, correction, hi, lo);
+                const batch_type scaled = ldexp(approximated, to_int(k));
+                const batch_type flushed = select(self <= reducer_t::minlog(), batch_type(0.), scaled);
+                return select(self >= reducer_t::maxlog(), constants::infinity<batch_type>(), flushed);
+            }
+        }
+
+        // Generic exp for real batches: forwards to the shared detail::exp
+        // implementation with the exp_tag reduction.
+        template <class A, class T>
+        inline batch<T, A> exp(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return detail::exp<detail::exp_tag>(self);
+        }
+
+        // Generic exp for complex batches:
+        // exp(a + i b) = exp(a) * (cos(b) + i sin(b)).
+        // sincos returns the sine as element 0 and the cosine as element 1.
+        template <class A, class T>
+        inline batch<std::complex<T>, A> exp(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            auto sin_cos = sincos(self.imag());
+            const batch_type unit(std::get<1>(sin_cos), std::get<0>(sin_cos));
+            return exp(self.real()) * unit;
+        }
+
+        // exp10
+        // Generic exp10: forwards to the shared detail::exp implementation with the
+        // exp10_tag reduction.
+        template <class A, class T>
+        inline batch<T, A> exp10(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return detail::exp<detail::exp10_tag>(self);
+        }
+
+        // exp2
+        // Generic exp2: forwards to the shared detail::exp implementation with the
+        // exp2_tag reduction.
+        template <class A, class T>
+        inline batch<T, A> exp2(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return detail::exp<detail::exp2_tag>(self);
+        }
+
+        // expm1
+        namespace detail
+        {
+            /* origin: boost/simd/arch/common/detail/generic/expm1_kernel.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            // Single-precision expm1 kernel, without range checks (the public expm1
+            // handles out-of-range inputs before using this result).
+            // Reduces a to x by removing k * log_2hi and k * log_2lo with
+            // k = nearbyint(a * invlog_2), evaluates a polynomial-based correction on x,
+            // and rescales the result with ldexp by k.
+            template <class A>
+            static inline batch<float, A> expm1(const batch<float, A>& a) noexcept
+            {
+                using batch_type = batch<float, A>;
+                using i_type = as_integer_t<batch_type>;
+                const batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
+                const batch_type coarse = fnma(k, constants::log_2hi<batch_type>(), a);
+                const batch_type x = fnma(k, constants::log_2lo<batch_type>(), coarse);
+                const batch_type half_x = x * batch_type(0.5);
+                const batch_type half_sq = x * half_x;
+                const batch_type r = detail::horner<batch_type,
+                                                    0X3F800000UL, // 1
+                                                    0XBD08887FUL, // -3.3333298E-02
+                                                    0X3ACF6DB4UL // approx. 1.58255E-03
+                                                    >(half_sq);
+                const batch_type t = fnma(r, half_x, batch_type(3.));
+                const batch_type ratio = half_sq * ((r - t) / (batch_type(6.) - x * t));
+                const batch_type e = fms(x, ratio, half_sq);
+                const i_type ik = to_int(k);
+                // Bit pattern with (maxexponent - k) placed in the exponent field.
+                // NOTE(review): presumably the value 2^-k -- confirm against constants.
+                const batch_type two2mk = ::xsimd::bitwise_cast<float>((constants::maxexponent<batch_type>() - ik) << constants::nmb<batch_type>());
+                const batch_type y = batch_type(1.) - two2mk - (e - x);
+                return ldexp(y, ik);
+            }
+
+            // Double-precision expm1 kernel, without range checks (the public expm1
+            // handles out-of-range inputs before using this result).
+            // Reduces a to x = hi - lo with k = nearbyint(a * invlog_2), evaluates a
+            // polynomial-based correction that also accounts for the rounding error
+            // c = (hi - x) - lo of the reduction, and rescales with ldexp by k.
+            template <class A>
+            static inline batch<double, A> expm1(const batch<double, A>& a) noexcept
+            {
+                using batch_type = batch<double, A>;
+                using i_type = as_integer_t<batch_type>;
+                const batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
+                const batch_type hi = fnma(k, constants::log_2hi<batch_type>(), a);
+                const batch_type lo = k * constants::log_2lo<batch_type>();
+                const batch_type x = hi - lo;
+                const batch_type half_sq = x * x * batch_type(0.5);
+                const batch_type r = detail::horner<batch_type,
+                                                    0X3FF0000000000000ULL,
+                                                    0XBFA11111111110F4ULL,
+                                                    0X3F5A01A019FE5585ULL,
+                                                    0XBF14CE199EAADBB7ULL,
+                                                    0X3ED0CFCA86E65239ULL,
+                                                    0XBE8AFDB76E09C32DULL>(half_sq);
+                const batch_type t = batch_type(3.) - r * batch_type(0.5) * x;
+                const batch_type ratio = half_sq * ((r - t) / (batch_type(6) - x * t));
+                const batch_type c = (hi - x) - lo;
+                const batch_type e = (x * (ratio - c) - c) - half_sq;
+                const i_type ik = to_int(k);
+                // Bit pattern with (maxexponent - k) placed in the exponent field.
+                // NOTE(review): presumably the value 2^-k -- confirm against constants.
+                const batch_type two2mk = ::xsimd::bitwise_cast<double>((constants::maxexponent<batch_type>() - ik) << constants::nmb<batch_type>());
+                // Two ways of assembling the result; the first is used for k < 20.
+                const batch_type for_small_k = batch_type(1.) - two2mk - (e - x);
+                batch_type for_large_k = x - (e + two2mk);
+                ++for_large_k;
+                const batch_type y = select(k < batch_type(20.), for_small_k, for_large_k);
+                return ldexp(y, ik);
+            }
+
+        }
+
+        // Generic expm1 for real batches.
+        // Returns -1 for inputs below constants::logeps, infinity for inputs above
+        // constants::maxlog, and the detail::expm1 kernel result otherwise.
+        template <class A, class T>
+        inline batch<T, A> expm1(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const batch_type core = detail::expm1(self);
+            const batch_type capped = select(self > constants::maxlog<batch_type>(),
+                                             constants::infinity<batch_type>(),
+                                             core);
+            return select(self < constants::logeps<batch_type>(), batch_type(-1.), capped);
+        }
+
+        // Generic expm1 for complex batches, with z = a + i b:
+        //   real part: expm1(a) - 2 * exp(a) * sin(b / 2)^2
+        //   imag part: exp(a) * sin(b)
+        // where exp(a) is obtained as expm1(a) + 1.
+        template <class A, class T>
+        inline batch<std::complex<T>, A> expm1(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            using real_batch = typename batch_type::real_batch;
+            const real_batch real_m1 = expm1(z.real());
+            const real_batch exp_re = real_m1 + 1.;
+            const real_batch sin_im = sin(z.imag());
+            const real_batch sin_half = sin(z.imag() * 0.5);
+            return { real_m1 - 2. * exp_re * sin_half * sin_half, exp_re * sin_im };
+        }
+
+        // polar
+        // Builds a complex batch from a magnitude r and an angle theta:
+        // the real part is r * cos(theta), the imaginary part is r * sin(theta).
+        // sincos returns the sine as .first and the cosine as .second.
+        template <class A, class T>
+        inline batch<std::complex<T>, A> polar(const batch<T, A>& r, const batch<T, A>& theta, requires_arch<generic>) noexcept
+        {
+            auto sin_cos = sincos(theta);
+            const batch<T, A> re = r * sin_cos.second;
+            const batch<T, A> im = r * sin_cos.first;
+            return { re, im };
+        }
+
+        // fdim
+        // Positive difference: fmax(0, self - other), lane by lane.
+        template <class A, class T>
+        inline batch<T, A> fdim(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            const batch<T, A> zero(0);
+            const batch<T, A> diff = self - other;
+            return fmax(zero, diff);
+        }
+
+        // fmod
+        // Remainder of self / other, computed as self - trunc(self / other) * other
+        // through a single fnma.
+        template <class A, class T>
+        inline batch<T, A> fmod(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            const auto quotient = trunc(self / other);
+            return fnma(quotient, other, self);
+        }
+
+        // frexp
+        /* origin: boost/simd/arch/common/simd/function/ifrexp.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A, class T>
+        inline batch<T, A> frexp(const batch<T, A>& self, batch<as_integer_t<T>, A>& exp, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            using int_type = as_integer_t<T>;
+            using i_type = batch<int_type, A>;
+            const auto nonzero = (self != batch_type(0.)); // zero lanes yield mantissa 0 and exponent 0
+            i_type exp_mask = constants::mask1frexp<batch_type>();
+            const batch_type mantissa = (self & ::xsimd::bitwise_cast<T>(~exp_mask)) | ::xsimd::bitwise_cast<T>(constants::mask2frexp<batch_type>());
+            exp = ((exp_mask & ::xsimd::bitwise_cast<int_type>(self)) >> constants::nmb<batch_type>()) - constants::maxexponentm1<batch_type>();
+            exp = select(batch_bool_cast<typename i_type::value_type>(nonzero), exp, i_type(typename i_type::value_type(0)));
+            return select(nonzero, mantissa, batch_type(0.));
+        }
+
+        // from bool: turn a mask into a numeric batch by and-ing its raw data with a batch of ones
+        template <class A, class T>
+        inline batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return batch<T, A>(self.data) & batch<T, A>(1);
+        }
+
+        // horner: forwards the coefficient pack Coefs to detail::horner (note the <T, A> parameter order of this overload)
+        template <class T, class A, uint64_t... Coefs>
+        inline batch<T, A> horner(const batch<T, A>& self) noexcept
+        {
+            return detail::horner<batch<T, A>, Coefs...>(self);
+        }
+
+        // hypot: sqrt of fma(self, self, other * other); the operands are not rescaled here
+        template <class A, class T>
+        inline batch<T, A> hypot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            return sqrt(fma(self, self, other * other));
+        }
+
+        // ipow: power with an exponent of type ITy, delegated to ::xsimd::detail::ipow
+        template <class A, class T, class ITy>
+        inline batch<T, A> ipow(batch<T, A> const& self, ITy other, requires_arch<generic>) noexcept
+        {
+            return ::xsimd::detail::ipow(self, other);
+        }
+
+        // ldexp
+        /* origin: boost/simd/arch/common/simd/function/ldexp.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A, class T>
+        inline batch<T, A> ldexp(const batch<T, A>& self, const batch<as_integer_t<T>, A>& other, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            using itype = as_integer_t<batch_type>;
+            const itype biased = other + constants::maxexponent<T>(); // presumably adds the exponent bias; confirm
+            const itype scale_bits = biased << constants::nmb<T>(); // shift into position (nmb presumably = mantissa width)
+            return self * ::xsimd::bitwise_cast<T>(scale_bits); // the shifted integer is reinterpreted as a floating-point factor
+        }
+
+        // lgamma: forward declaration; the definition follows the detail::lgamma_impl specializations
+        template <class A, class T>
+        inline batch<T, A> lgamma(batch<T, A> const& self, requires_arch<generic>) noexcept;
+
+        namespace detail
+        {
+            /* origin: boost/simd/arch/common/detail/generic/gammaln_kernel.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class A>
+            static inline batch<float, A> gammalnB(const batch<float, A>& x) noexcept // polynomial used by lgamma_impl<float>::other for lanes not handled by gammalnC
+            {
+                return horner<batch<float, A>, // coefficients are given as float bit patterns; decimal values alongside
+                              0x3ed87730, //    4.227843421859038E-001
+                              0x3ea51a64, //    3.224669577325661E-001,
+                              0xbd89f07e, //   -6.735323259371034E-002,
+                              0x3ca89ed8, //    2.058355474821512E-002,
+                              0xbbf164fd, //   -7.366775108654962E-003,
+                              0x3b3ba883, //    2.863437556468661E-003,
+                              0xbaabeab1, //   -1.311620815545743E-003,
+                              0x3a1ebb94 //    6.055172732649237E-004
+                              >(x);
+            }
+
+            template <class A>
+            static inline batch<float, A> gammalnC(const batch<float, A>& x) noexcept // polynomial used by lgamma_impl<float>::other for lanes with 0.75 <= x < 1.25
+            {
+                return horner<batch<float, A>, // coefficients are given as float bit patterns; decimal values alongside
+                              0xbf13c468, //   -5.772156501719101E-001
+                              0x3f528d34, //    8.224670749082976E-001,
+                              0xbecd27a8, //   -4.006931650563372E-001,
+                              0x3e8a898b, //    2.705806208275915E-001,
+                              0xbe53c04f, //   -2.067882815621965E-001,
+                              0x3e2d4dab, //    1.692415923504637E-001,
+                              0xbe22d329, //   -1.590086327657347E-001,
+                              0x3e0c3c4f //    1.369488127325832E-001
+                              >(x);
+            }
+
+            template <class A>
+            static inline batch<float, A> gammaln2(const batch<float, A>& x) noexcept // correction polynomial; lgamma_impl<float>::other evaluates it at 1 / (x * x)
+            {
+                return horner<batch<float, A>, // coefficients are given as float bit patterns; decimal values alongside
+                              0x3daaaa94, //   8.333316229807355E-002f
+                              0xbb358701, //  -2.769887652139868E-003f,
+                              0x3a31fd69 //   6.789774945028216E-004f
+                              >(x);
+            }
+
+            template <class A>
+            static inline batch<double, A> gammaln1(const batch<double, A>& x) noexcept // rational function used by lgamma_impl<double>::other for x < 13
+            {
+                return horner<batch<double, A>, // numerator; coefficients are double bit patterns, decimal values alongside
+                              0xc12a0c675418055eull, //  -8.53555664245765465627E5
+                              0xc13a45890219f20bull, //  -1.72173700820839662146E6,
+                              0xc131bc82f994db51ull, //  -1.16237097492762307383E6,
+                              0xc1143d73f89089e5ull, //  -3.31612992738871184744E5,
+                              0xc0e2f234355bb93eull, //  -3.88016315134637840924E4,
+                              0xc09589018ff36761ull //  -1.37825152569120859100E3
+                              >(x)
+                    / horner<batch<double, A>, // denominator
+                             0xc13ece4b6a11e14aull, //  -2.01889141433532773231E6
+                             0xc1435255892ff34cull, //  -2.53252307177582951285E6,
+                             0xc131628671950043ull, //  -1.13933444367982507207E6,
+                             0xc10aeb84b9744c9bull, //  -2.20528590553854454839E5,
+                             0xc0d0aa0d7b89d757ull, //  -1.70642106651881159223E4,
+                             0xc075fd0d1cf312b2ull, //  -3.51815701436523470549E2,
+                             0x3ff0000000000000ull //   1.00000000000000000000E0
+                             >(x);
+            }
+
+            template <class A>
+            static inline batch<double, A> gammalnA(const batch<double, A>& x) noexcept // correction polynomial; lgamma_impl<double>::other evaluates it at 1 / (xx * xx)
+            {
+                return horner<batch<double, A>, // coefficients are double bit patterns; decimal values alongside
+                              0x3fb555555555554bull, //    8.33333333333331927722E-2
+                              0xbf66c16c16b0a5a1ull, //   -2.77777777730099687205E-3,
+                              0x3f4a019f20dc5ebbull, //    7.93650340457716943945E-4,
+                              0xbf437fbdb580e943ull, //   -5.95061904284301438324E-4,
+                              0x3f4a985027336661ull //    8.11614167470508450300E-4
+                              >(x);
+            }
+
+            /* origin: boost/simd/arch/common/simd/function/gammaln.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class B>
+            struct lgamma_impl; // primary template is only declared; float and double batch specializations follow
+
+            template <class A>
+            struct lgamma_impl<batch<float, A>> // single-precision implementation
+            {
+                using batch_type = batch<float, A>;
+                static inline batch_type compute(const batch_type& a) noexcept // entry point called by lgamma()
+                {
+                    auto inf_result = (a <= batch_type(0.)) && is_flint(a); // a <= 0 and is_flint(a): these lanes end up as +infinity
+                    batch_type x = select(inf_result, constants::nan<batch_type>(), a);
+                    batch_type q = abs(x);
+#ifndef XSIMD_NO_INFINITIES
+                    inf_result = (x == constants::infinity<batch_type>()) || inf_result;
+#endif
+                    auto ltza = a < batch_type(0.);
+                    batch_type r = constants::nan<batch_type>(); // initialised so the select below never reads an indeterminate value
+                    batch_type r1 = other(q);
+                    if (any(ltza))
+                    {
+                        r = select(inf_result, constants::infinity<batch_type>(), negative(q, r1));
+                        if (all(ltza))
+                            return r;
+                    }
+                    batch_type r2 = select(ltza, r, r1); // negative lanes take r, the others r1
+                    return select(a == constants::minusinfinity<batch_type>(), constants::nan<batch_type>(), select(inf_result, constants::infinity<batch_type>(), r2));
+                }
+
+            private:
+                static inline batch_type negative(const batch_type& q, const batch_type& w) noexcept // negative inputs: q = |a|, w = other(q)
+                {
+                    batch_type p = floor(q);
+                    batch_type z = q - p;
+                    auto test2 = z < batch_type(0.5);
+                    z = select(test2, z - batch_type(1.), z);
+                    z = q * sin(z, trigo_pi_tag());
+                    return -log(constants::invpi<batch_type>() * abs(z)) - w;
+                }
+
+                static inline batch_type other(const batch_type& x) noexcept // x is |a| (see compute)
+                {
+                    auto xlt650 = (x < batch_type(6.5)); // lanes below 6.5 use the shifted-polynomial path, the rest the r2 formula at the end
+                    batch_type r0x = x; // polynomial argument
+                    batch_type r0z = x; // product accumulated while shifting x; its log is added back
+                    batch_type r0s = batch_type(1.); // sign applied to log(r0z)
+                    batch_type r1 = batch_type(0.);
+                    batch_type p = constants::nan<batch_type>();
+                    if (any(xlt650))
+                    {
+                        batch_type z = batch_type(1.);
+                        batch_type tx = select(xlt650, x, batch_type(0.));
+                        batch_type nx = batch_type(0.);
+                        const batch_type _075 = batch_type(0.75);
+                        const batch_type _150 = batch_type(1.50);
+                        const batch_type _125 = batch_type(1.25);
+                        const batch_type _250 = batch_type(2.50);
+                        auto xge150 = (x >= _150);
+                        auto txgt250 = (tx > _250);
+
+                        // x >= 1.5
+                        while (any(xge150 && txgt250))
+                        {
+                            nx = select(txgt250, nx - batch_type(1.), nx);
+                            tx = select(txgt250, x + nx, tx);
+                            z = select(txgt250, z * tx, z);
+                            txgt250 = (tx > _250);
+                        }
+                        r0x = select(xge150, x + nx - batch_type(2.), x);
+                        r0z = select(xge150, z, r0z);
+                        r0s = select(xge150, batch_type(1.), r0s);
+
+                        // x >= 1.25 && x < 1.5
+                        auto xge125 = (x >= _125);
+                        auto xge125t = xge125 && !xge150;
+                        if (any(xge125))
+                        {
+                            r0x = select(xge125t, x - batch_type(1.), r0x);
+                            r0z = select(xge125t, z * x, r0z);
+                            r0s = select(xge125t, batch_type(-1.), r0s);
+                        }
+
+                        // x >= 0.75 && x < 1.5
+                        batch_bool<float, A> kernelC(false);
+                        auto xge075 = (x >= _075);
+                        auto xge075t = xge075 && !xge125;
+                        if (any(xge075t))
+                        {
+                            kernelC = xge075t;
+                            r0x = select(xge075t, x - batch_type(1.), r0x); // fall back to r0x, not x, so lanes with x >= 1.25 keep the value computed above
+                            r0z = select(xge075t, batch_type(1.), r0z);
+                            r0s = select(xge075t, batch_type(-1.), r0s);
+                            p = gammalnC(r0x);
+                        }
+
+                        // tx < 1.5 && x < 0.75
+                        auto txlt150 = (tx < _150) && !xge075;
+                        if (any(txlt150))
+                        {
+                            auto orig = txlt150;
+                            while (any(txlt150))
+                            {
+                                z = select(txlt150, z * tx, z);
+                                nx = select(txlt150, nx + batch_type(1.), nx);
+                                tx = select(txlt150, x + nx, tx);
+                                txlt150 = (tx < _150) && !xge075;
+                            }
+                            r0x = select(orig, r0x + nx - batch_type(2.), r0x);
+                            r0z = select(orig, z, r0z);
+                            r0s = select(orig, batch_type(-1.), r0s);
+                        }
+                        p = select(kernelC, p, gammalnB(r0x)); // kernelC lanes keep gammalnC, every other lane uses gammalnB
+                        if (all(xlt650))
+                            return fma(r0x, p, r0s * log(abs(r0z)));
+                    }
+                    r0z = select(xlt650, abs(r0z), x);
+                    batch_type m = log(r0z);
+                    r1 = fma(r0x, p, r0s * m);
+                    batch_type r2 = fma(x - batch_type(0.5), m, constants::logsqrt2pi<batch_type>() - x);
+                    r2 += gammaln2(batch_type(1.) / (x * x)) / x;
+                    return select(xlt650, r1, r2);
+                }
+            };
+
+            template <class A>
+            struct lgamma_impl<batch<double, A>> // double-precision implementation
+            {
+                using batch_type = batch<double, A>;
+
+                static inline batch_type compute(const batch_type& a) noexcept // entry point called by lgamma()
+                {
+                    auto inf_result = (a <= batch_type(0.)) && is_flint(a);
+                    batch_type x = select(inf_result, constants::nan<batch_type>(), a);
+                    batch_type q = abs(x);
+#ifndef XSIMD_NO_INFINITIES
+                    inf_result = (q == constants::infinity<batch_type>()); // NOTE(review): overwrites the mask above, whereas the float version or-s it in - confirm this is intended
+#endif
+                    auto test = (a < batch_type(-34.)); // lanes below -34 go through large_negative
+                    batch_type r = constants::nan<batch_type>();
+                    if (any(test))
+                    {
+                        r = large_negative(q);
+                        if (all(test))
+                            return select(inf_result, constants::nan<batch_type>(), r);
+                    }
+                    batch_type r1 = other(a); // note: called with a, not with x or q
+                    batch_type r2 = select(test, r, r1);
+                    return select(a == constants::minusinfinity<batch_type>(), constants::nan<batch_type>(), select(inf_result, constants::infinity<batch_type>(), r2));
+                }
+
+            private:
+                static inline batch_type large_negative(const batch_type& q) noexcept // q = |a| for lanes with a < -34
+                {
+                    batch_type w = lgamma(q);
+                    batch_type p = floor(q);
+                    batch_type z = q - p;
+                    auto test2 = (z < batch_type(0.5));
+                    z = select(test2, z - batch_type(1.), z);
+                    z = q * sin(z, trigo_pi_tag());
+                    z = abs(z);
+                    return constants::logpi<batch_type>() - log(z) - w;
+                }
+
+                static inline batch_type other(const batch_type& xx) noexcept
+                {
+                    batch_type x = xx;
+                    auto test = (x < batch_type(13.)); // lanes below 13 use the gammaln1 path, the rest the r2 formula at the end
+                    batch_type r1 = batch_type(0.);
+                    if (any(test))
+                    {
+                        batch_type z = batch_type(1.);
+                        batch_type p = batch_type(0.);
+                        batch_type u = select(test, x, batch_type(0.));
+                        auto test1 = (u >= batch_type(3.));
+                        while (any(test1)) // step u down by 1 until it is below 3, multiplying z by each new u
+                        {
+                            p = select(test1, p - batch_type(1.), p);
+                            u = select(test1, x + p, u);
+                            z = select(test1, z * u, z);
+                            test1 = (u >= batch_type(3.));
+                        }
+
+                        auto test2 = (u < batch_type(2.));
+                        while (any(test2)) // step u up by 1 until it reaches 2, dividing z by each old u
+                        {
+                            z = select(test2, z / u, z);
+                            p = select(test2, p + batch_type(1.), p);
+                            u = select(test2, x + p, u);
+                            test2 = (u < batch_type(2.));
+                        }
+
+                        z = abs(z);
+                        x += p - batch_type(2.);
+                        r1 = x * gammaln1(x) + log(z);
+                        if (all(test))
+                            return r1;
+                    }
+                    batch_type r2 = fma(xx - batch_type(0.5), log(xx), constants::logsqrt2pi<batch_type>() - xx);
+                    batch_type p = batch_type(1.) / (xx * xx);
+                    r2 += gammalnA(p) / xx;
+                    return select(test, r1, r2);
+                }
+            };
+        }
+
+        template <class A, class T>
+        inline batch<T, A> lgamma(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return detail::lgamma_impl<batch<T, A>>::compute(self); // dispatches on the batch type (float / double specializations)
+        }
+
+        // log
+        /* origin: boost/simd/arch/common/simd/function/log.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A>
+        inline batch<float, A> log(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<float, A>;
+            using int_type = as_integer_t<float>;
+            using i_type = batch<int_type, A>;
+            batch_type x = self;
+            i_type k(0); // accumulated binary exponent
+            auto isnez = (self != batch_type(0.));
+#ifndef XSIMD_NO_DENORMALS
+            auto test = (self < constants::smallestposval<batch_type>()) && isnez; // non-zero lanes below smallestposval (presumably the smallest normal value)
+            if (any(test))
+            {
+                k = select(batch_bool_cast<int_type>(test), k - i_type(23), k);
+                x = select(test, x * batch_type(8388608ul), x); // scale by 2^23, compensated by k - 23 above
+            }
+#endif
+            i_type ix = ::xsimd::bitwise_cast<int_type>(x); // bits are taken after the rescaling above
+            ix += 0x3f800000 - 0x3f3504f3;
+            k += (ix >> 23) - 0x7f;
+            ix = (ix & i_type(0x007fffff)) + 0x3f3504f3;
+            x = ::xsimd::bitwise_cast<float>(ix); // reduced argument rebuilt from the adjusted bits
+            batch_type f = --x; // f = x - 1 (x itself is decremented too)
+            batch_type s = f / (batch_type(2.) + f);
+            batch_type z = s * s;
+            batch_type w = z * z;
+            batch_type t1 = w * detail::horner<batch_type, 0x3eccce13, 0x3e789e26>(w);
+            batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa, 0x3e91e9ee>(w);
+            batch_type R = t2 + t1;
+            batch_type hfsq = batch_type(0.5) * f * f;
+            batch_type dk = to_float(k);
+            batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>()) - hfsq + f);
+#ifndef XSIMD_NO_INFINITIES
+            batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>()); // zero -> -inf, +inf -> +inf
+#else
+            batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>()); // zero -> -inf
+#endif
+            return select(!(self >= batch_type(0.)), constants::nan<batch_type>(), zz); // lanes failing self >= 0 (negative or NaN) -> NaN
+        }
+
+        template <class A>
+        inline batch<double, A> log(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<double, A>;
+            using int_type = as_integer_t<double>;
+            using i_type = batch<int_type, A>;
+
+            batch_type x = self;
+            i_type k(0); // accumulated binary exponent
+            auto isnez = (self != batch_type(0.));
+#ifndef XSIMD_NO_DENORMALS
+            auto test = (self < constants::smallestposval<batch_type>()) && isnez; // non-zero lanes below smallestposval (presumably the smallest normal value)
+            if (any(test))
+            {
+                k = select(batch_bool_cast<int_type>(test), k - i_type(54), k);
+                x = select(test, x * batch_type(18014398509481984ull), x); // scale by 2^54, compensated by k - 54 above
+            }
+#endif
+            i_type hx = ::xsimd::bitwise_cast<int_type>(x) >> 32; // high word, taken after the rescaling so it matches the low word and k used below
+            hx += 0x3ff00000 - 0x3fe6a09e;
+            k += (hx >> 20) - 0x3ff;
+            batch_type dk = to_float(k);
+            hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
+            x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x))); // adjusted high word + original low word
+
+            batch_type f = --x; // f = x - 1 (x itself is decremented too)
+            batch_type hfsq = batch_type(0.5) * f * f;
+            batch_type s = f / (batch_type(2.) + f);
+            batch_type z = s * s;
+            batch_type w = z * z;
+
+            batch_type t1 = w * detail::horner<batch_type, 0x3fd999999997fa04ll, 0x3fcc71c51d8e78afll, 0x3fc39a09d078c69fll>(w);
+            batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll, 0x3fd2492494229359ll, 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w);
+            batch_type R = t2 + t1;
+            batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>()) - hfsq + f);
+#ifndef XSIMD_NO_INFINITIES
+            batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>()); // zero -> -inf, +inf -> +inf
+#else
+            batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>()); // zero -> -inf
+#endif
+            return select(!(self >= batch_type(0.)), constants::nan<batch_type>(), zz); // lanes failing self >= 0 (negative or NaN) -> NaN
+        }
+
+        template <class A, class T>
+        inline batch<std::complex<T>, A> log(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            return batch<std::complex<T>, A>(log(abs(z)), atan2(z.imag(), z.real())); // real part: log of the modulus; imaginary part: atan2(imag, real)
+        }
+
+        // log2: single-precision variant; same argument reduction as log<float>, final scaling by invlog_2
+        template <class A>
+        inline batch<float, A> log2(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<float, A>;
+            using int_type = as_integer_t<float>;
+            using i_type = batch<int_type, A>;
+            batch_type x = self;
+            i_type k(0); // accumulated binary exponent
+            auto isnez = (self != batch_type(0.));
+#ifndef XSIMD_NO_DENORMALS
+            auto test = (self < constants::smallestposval<batch_type>()) && isnez; // non-zero lanes below smallestposval (presumably the smallest normal value)
+            if (any(test))
+            {
+                k = select(batch_bool_cast<int_type>(test), k - i_type(25), k);
+                x = select(test, x * batch_type(33554432ul), x); // scale by 2^25, compensated by k - 25 above
+            }
+#endif
+            i_type ix = ::xsimd::bitwise_cast<int_type>(x); // bits are taken after the rescaling above
+            ix += 0x3f800000 - 0x3f3504f3;
+            k += (ix >> 23) - 0x7f;
+            ix = (ix & i_type(0x007fffff)) + 0x3f3504f3;
+            x = ::xsimd::bitwise_cast<float>(ix);
+            batch_type f = --x; // f = x - 1 (x itself is decremented too)
+            batch_type s = f / (batch_type(2.) + f);
+            batch_type z = s * s;
+            batch_type w = z * z;
+            batch_type t1 = w * detail::horner<batch_type, 0x3eccce13, 0x3e789e26>(w);
+            batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa, 0x3e91e9ee>(w);
+            batch_type R = t1 + t2;
+            batch_type hfsq = batch_type(0.5) * f * f;
+            batch_type dk = to_float(k);
+            batch_type r = fma(fms(s, hfsq + R, hfsq) + f, constants::invlog_2<batch_type>(), dk); // reduced-argument term times invlog_2, plus the exponent dk
+#ifndef XSIMD_NO_INFINITIES
+            batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>()); // zero -> -inf, +inf -> +inf
+#else
+            batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>()); // zero -> -inf
+#endif
+            return select(!(self >= batch_type(0.)), constants::nan<batch_type>(), zz); // lanes failing self >= 0 (negative or NaN) -> NaN
+        }
+
+        template <class A>
+        inline batch<double, A> log2(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<double, A>;
+            using int_type = as_integer_t<double>;
+            using i_type = batch<int_type, A>;
+            batch_type x = self;
+            i_type k(0); // accumulated binary exponent
+            auto isnez = (self != batch_type(0.));
+#ifndef XSIMD_NO_DENORMALS
+            auto test = (self < constants::smallestposval<batch_type>()) && isnez; // non-zero lanes below smallestposval (presumably the smallest normal value)
+            if (any(test))
+            {
+                k = select(batch_bool_cast<typename i_type::value_type>(test), k - i_type(54), k);
+                x = select(test, x * batch_type(18014398509481984ull), x); // scale by 2^54, compensated by k - 54 above
+            }
+#endif
+            i_type hx = ::xsimd::bitwise_cast<int_type>(x) >> 32; // high word, taken after the rescaling so it matches the low word and k used below
+            hx += 0x3ff00000 - 0x3fe6a09e;
+            k += (hx >> 20) - 0x3ff;
+            hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
+            x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x))); // adjusted high word + original low word
+            batch_type f = --x; // f = x - 1 (x itself is decremented too)
+            batch_type s = f / (batch_type(2.) + f);
+            batch_type z = s * s;
+            batch_type w = z * z;
+            batch_type t1 = w * detail::horner<batch_type, 0x3fd999999997fa04ll, 0x3fcc71c51d8e78afll, 0x3fc39a09d078c69fll>(w);
+            batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll, 0x3fd2492494229359ll, 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w);
+            batch_type R = t2 + t1;
+            batch_type hfsq = batch_type(0.5) * f * f;
+            batch_type hi = f - hfsq;
+            hi = hi & ::xsimd::bitwise_cast<double>((constants::allbits<i_type>() << 32)); // clear the low 32 bits of hi
+            batch_type lo = fma(s, hfsq + R, f - hi - hfsq);
+            batch_type val_hi = hi * constants::invlog_2hi<batch_type>();
+            batch_type val_lo = fma(lo + hi, constants::invlog_2lo<batch_type>(), lo * constants::invlog_2hi<batch_type>());
+            batch_type dk = to_float(k);
+            batch_type w1 = dk + val_hi;
+            val_lo += (dk - w1) + val_hi; // fold the rounding error of dk + val_hi into the low part
+            val_hi = w1;
+            batch_type r = val_lo + val_hi;
+#ifndef XSIMD_NO_INFINITIES
+            batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>()); // zero -> -inf, +inf -> +inf
+#else
+            batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>()); // zero -> -inf
+#endif
+            return select(!(self >= batch_type(0.)), constants::nan<batch_type>(), zz); // lanes failing self >= 0 (negative or NaN) -> NaN
+        }
+
+        namespace detail
+        {
+            template <class T, class A>
+            inline batch<T, A> logN_complex_impl(const batch<T, A>& z, typename batch<T, A>::value_type base) noexcept
+            {
+                const batch<T, A> natural_log = log(z);
+                const batch<T, A> divisor(typename batch<T, A>::value_type(base)); // base is used directly as the divisor (log2 passes std::log(2))
+                return natural_log / divisor;
+            }
+        }
+
+        template <class A, class T>
+        inline batch<std::complex<T>, A> log2(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+        {
+            return detail::logN_complex_impl(self, std::log(2)); // log(self) divided by log(2); std::log(2) is converted to the batch's value_type
+        }
+
+        // log10
+        /* origin: FreeBSD /usr/src/lib/msun/src/e_log10f.c */
+        /*
+         * ====================================================
+         * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+         *
+         * Developed at SunPro, a Sun Microsystems, Inc. business.
+         * Permission to use, copy, modify, and distribute this
+         * software is freely granted, provided that this notice
+         * is preserved.
+         * ====================================================
+         */
+        template <class A>
+        inline batch<float, A> log10(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<float, A>;
+            const batch_type // hi/lo constant pairs used in the final combination below
+                ivln10hi(4.3432617188e-01f),
+                ivln10lo(-3.1689971365e-05f),
+                log10_2hi(3.0102920532e-01f),
+                log10_2lo(7.9034151668e-07f);
+            using int_type = as_integer_t<float>;
+            using i_type = batch<int_type, A>;
+            batch_type x = self;
+            i_type k(0); // accumulated binary exponent
+            auto isnez = (self != batch_type(0.));
+#ifndef XSIMD_NO_DENORMALS
+            auto test = (self < constants::smallestposval<batch_type>()) && isnez; // non-zero lanes below smallestposval (presumably the smallest normal value)
+            if (any(test))
+            {
+                k = select(batch_bool_cast<int_type>(test), k - i_type(25), k);
+                x = select(test, x * batch_type(33554432ul), x); // scale by 2^25, compensated by k - 25 above
+            }
+#endif
+            i_type ix = ::xsimd::bitwise_cast<int_type>(x); // bits are taken after the rescaling above
+            ix += 0x3f800000 - 0x3f3504f3;
+            k += (ix >> 23) - 0x7f;
+            ix = (ix & i_type(0x007fffff)) + 0x3f3504f3;
+            x = ::xsimd::bitwise_cast<float>(ix);
+            batch_type f = --x; // f = x - 1 (x itself is decremented too)
+            batch_type s = f / (batch_type(2.) + f);
+            batch_type z = s * s;
+            batch_type w = z * z;
+            batch_type t1 = w * detail::horner<batch_type, 0x3eccce13, 0x3e789e26>(w);
+            batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa, 0x3e91e9ee>(w);
+            batch_type R = t2 + t1;
+            batch_type dk = to_float(k);
+            batch_type hfsq = batch_type(0.5) * f * f;
+            batch_type hibits = f - hfsq;
+            hibits &= ::xsimd::bitwise_cast<float>(i_type(0xfffff000)); // clear the low 12 bits of hibits
+            batch_type lobits = fma(s, hfsq + R, f - hibits - hfsq);
+            batch_type r = fma(dk, log10_2hi,
+                               fma(hibits, ivln10hi,
+                                   fma(lobits, ivln10hi,
+                                       fma(lobits + hibits, ivln10lo, dk * log10_2lo))));
+#ifndef XSIMD_NO_INFINITIES
+            batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>()); // zero -> -inf, +inf -> +inf
+#else
+            batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>()); // zero -> -inf
+#endif
+            return select(!(self >= batch_type(0.)), constants::nan<batch_type>(), zz); // lanes failing self >= 0 (negative or NaN) -> NaN
+        }
+
+        template <class A> // base-10 logarithm for batches of double (generic kernel)
+        inline batch<double, A> log10(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<double, A>;
+            const batch_type
+                ivln10hi(4.34294481878168880939e-01), // 1/ln(10), leading part
+                ivln10lo(2.50829467116452752298e-11), // 1/ln(10), trailing correction
+                log10_2hi(3.01029995663611771306e-01), // log10(2), leading part
+                log10_2lo(3.69423907715893078616e-13); // log10(2), trailing correction
+            using int_type = as_integer_t<double>;
+            using i_type = batch<int_type, A>;
+            batch_type x = self;
+            i_type k(0); // binary exponent gathered during range reduction
+            auto isnez = (self != batch_type(0.)); // lanes whose input is not zero
+#ifndef XSIMD_NO_DENORMALS
+            auto test = (self < constants::smallestposval<batch_type>()) && isnez; // nonzero lanes below smallestposval
+            if (any(test))
+            {
+                k = select(batch_bool_cast<int_type>(test), k - i_type(54), k); // account for the 2^54 rescaling in the exponent
+                x = select(test, x * batch_type(18014398509481984ull), x); // 18014398509481984 == 2^54
+            }
+#endif
+            i_type hx = ::xsimd::bitwise_cast<int_type>(x) >> 32; // high word, read after the rescaling so it describes the rescaled x
+            hx += 0x3ff00000 - 0x3fe6a09e; // 0x3fe6a09e is the high word of sqrt(2)/2 as a double
+            k += (hx >> 20) - 0x3ff; // unbiased binary exponent
+            hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; // keep the mantissa bits, rebase the exponent
+            x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x))); // new high word + low word of x
+            batch_type f = --x; // f = x - 1 (the pre-decrement also updates x)
+            batch_type dk = to_float(k); // exponent as floating point
+            batch_type s = f / (batch_type(2.) + f);
+            batch_type z = s * s;
+            batch_type w = z * z;
+            batch_type t1 = w * detail::horner<batch_type, 0x3fd999999997fa04ll, 0x3fcc71c51d8e78afll, 0x3fc39a09d078c69fll>(w);
+            batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll, 0x3fd2492494229359ll, 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w);
+            batch_type R = t2 + t1;
+            batch_type hfsq = batch_type(0.5) * f * f; // f^2 / 2
+            batch_type hi = f - hfsq;
+            hi = hi & ::xsimd::bitwise_cast<double>(constants::allbits<i_type>() << 32); // clear the low 32 bits of hi
+            batch_type lo = f - hi - hfsq + s * (hfsq + R); // what hi does not capture
+            batch_type val_hi = hi * ivln10hi;
+            batch_type y = dk * log10_2hi;
+            batch_type val_lo = dk * log10_2lo + (lo + hi) * ivln10lo + lo * ivln10hi;
+            batch_type w1 = y + val_hi; // sum of the two leading terms
+            val_lo += (y - w1) + val_hi; // fold the rounding error of that sum into the low part
+            val_hi = w1;
+            batch_type r = val_lo + val_hi;
+#ifndef XSIMD_NO_INFINITIES
+            batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>()); // 0 -> -inf, +inf -> +inf
+#else
+            batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>()); // 0 -> -inf
+#endif
+            return select(!(self >= batch_type(0.)), constants::nan<batch_type>(), zz); // NaN wherever (self >= 0) is false
+        }
+
+        template <class A, class T> // log10 for complex batches
+        inline batch<std::complex<T>, A> log10(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            return detail::logN_complex_impl(z, std::log(10)); // forwards z together with std::log(10) to the shared helper
+        }
+
+        // log1p
+        /* origin: boost/simd/arch/common/simd/function/log1p.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A> // log1p for float lanes: works on u = 1 + self
+        inline batch<float, A> log1p(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<float, A>;
+            using int_type = as_integer_t<float>;
+            using i_type = batch<int_type, A>;
+            const batch_type uf = self + batch_type(1.); // u = 1 + self
+            auto isnez = (uf != batch_type(0.)); // lanes where u is not zero
+            i_type iu = ::xsimd::bitwise_cast<int_type>(uf); // raw bits of u
+            iu += 0x3f800000 - 0x3f3504f3; // 0x3f3504f3 is the float bit pattern of sqrt(2)/2
+            i_type k = (iu >> 23) - 0x7f; // unbiased binary exponent
+            iu = (iu & i_type(0x007fffff)) + 0x3f3504f3; // keep the mantissa bits, rebase the exponent
+            batch_type f = --(::xsimd::bitwise_cast<float>(iu)); // reduced argument minus 1
+            batch_type s = f / (batch_type(2.) + f);
+            batch_type z = s * s;
+            batch_type w = z * z;
+            batch_type t1 = w * detail::horner<batch_type, 0x3eccce13, 0x3e789e26>(w);
+            batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa, 0x3e91e9ee>(w);
+            batch_type R = t2 + t1;
+            batch_type hfsq = batch_type(0.5) * f * f; // f^2 / 2
+            batch_type dk = to_float(k); // exponent as floating point
+            /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
+            batch_type c = select(batch_bool_cast<float>(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf;
+            batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
+#ifndef XSIMD_NO_INFINITIES
+            batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>()); // u == 0 -> -inf, self == +inf -> +inf
+#else
+            batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>()); // u == 0 -> -inf
+#endif
+            return select(!(uf >= batch_type(0.)), constants::nan<batch_type>(), zz); // NaN wherever (u >= 0) is false
+        }
+
+        template <class A> // log1p for double lanes: works on u = 1 + self
+        inline batch<double, A> log1p(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<double, A>;
+            using int_type = as_integer_t<double>;
+            using i_type = batch<int_type, A>;
+            const batch_type uf = self + batch_type(1.); // u = 1 + self
+            auto isnez = (uf != batch_type(0.)); // lanes where u is not zero
+            i_type hu = ::xsimd::bitwise_cast<int_type>(uf) >> 32; // upper 32 bits of u
+            hu += 0x3ff00000 - 0x3fe6a09e; // 0x3fe6a09e is the high word of sqrt(2)/2 as a double
+            i_type k = (hu >> 20) - 0x3ff; // unbiased binary exponent
+            /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
+            batch_type c = select(batch_bool_cast<double>(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf;
+            hu = (hu & i_type(0x000fffff)) + 0x3fe6a09e; // keep the mantissa bits, rebase the exponent
+            batch_type f = ::xsimd::bitwise_cast<double>((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(uf))); // new high word + low word of u
+            f = --f; // reduced argument minus 1
+            batch_type hfsq = batch_type(0.5) * f * f; // f^2 / 2
+            batch_type s = f / (batch_type(2.) + f);
+            batch_type z = s * s;
+            batch_type w = z * z;
+            batch_type t1 = w * detail::horner<batch_type, 0x3fd999999997fa04ll, 0x3fcc71c51d8e78afll, 0x3fc39a09d078c69fll>(w);
+            batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll, 0x3fd2492494229359ll, 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w);
+            batch_type R = t2 + t1;
+            batch_type dk = to_float(k); // exponent as floating point
+            batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, hfsq + R, dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
+#ifndef XSIMD_NO_INFINITIES
+            batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>()); // u == 0 -> -inf, self == +inf -> +inf
+#else
+            batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>()); // u == 0 -> -inf
+#endif
+            return select(!(uf >= batch_type(0.)), constants::nan<batch_type>(), zz); // NaN wherever (u >= 0) is false
+        }
+
+        template <class A, class T>
+        inline batch<std::complex<T>, A> log1p(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+        {
+            // Complex log1p with u = 1 + self: yields self where u == 1, log(u) where
+            // real(u) <= 0, and log(u) * self / (u - 1) in every other lane.
+            using cplx_batch = batch<std::complex<T>, A>;
+            using real_batch = typename cplx_batch::real_batch;
+            cplx_batch shifted = 1 + self;
+            cplx_batch log_shifted = log(shifted);
+            cplx_batch rescaled = log_shifted * self / (shifted - cplx_batch(1.));
+            cplx_batch general = select(shifted.real() <= real_batch(0.), log_shifted, rescaled);
+            return select(shifted == cplx_batch(1.), self, general);
+        }
+
+        // mod: integer remainder, applying the scalar operator % to each pair of lanes
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> mod(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            const auto scalar_mod = [](T lhs, T rhs) noexcept -> T
+            { return lhs % rhs; };
+            return detail::apply(scalar_mod, self, other);
+        }
+
+        // nearbyint
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type> // integral overload
+        inline batch<T, A> nearbyint(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return self; // integral lanes are returned unchanged
+        }
+        namespace detail
+        {
+            template <class A, class T> // shared float/double implementation of nearbyint
+            inline batch<T, A> nearbyintf(batch<T, A> const& self) noexcept
+            {
+                using batch_type = batch<T, A>;
+                batch_type s = bitofsign(self); // presumably the sign bit of each lane; verify against bitofsign
+                batch_type v = self ^ s; // self with the bits of s toggled off
+                batch_type t2n = constants::twotonmb<batch_type>(); // threshold constant; the rounding below is only used for v < t2n
+                // Under fast-math, reordering is possible and the compiler optimizes d
+                // to v. That's not what we want, so prevent compiler optimization here.
+                // FIXME: it may be better to emit a memory barrier here (?).
+#ifdef __FAST_MATH__
+                volatile batch_type d0 = v + t2n; // volatile keeps the addition from being folded away
+                batch_type d = *(batch_type*)(void*)(&d0) - t2n; // read d0 back through a non-volatile pointer, then subtract
+#else
+                batch_type d0 = v + t2n; // add ...
+                batch_type d = d0 - t2n; // ... then subtract the same constant
+#endif
+                return s ^ select(v < t2n, d, v); // keep v itself where v >= t2n, then re-apply s
+            }
+        }
+        template <class A> // float overload
+        inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            return detail::nearbyintf(self); // delegates to the shared implementation
+        }
+        template <class A> // double overload
+        inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            return detail::nearbyintf(self); // delegates to the shared implementation
+        }
+
+        // nearbyint_as_int
+        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type> // integral overload
+        inline batch<T, A> nearbyint_as_int(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return self; // integral lanes are returned unchanged
+        }
+
+        // nearbyint_as_int (float): rounds each lane with std::nearbyintf, i.e. following the current rounding mode
+        template <class A>
+        inline batch<as_integer_t<float>, A>
+        nearbyint_as_int(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            using U = as_integer_t<float>;
+            return kernel::detail::apply_transform<U>([](float x) noexcept -> U
+                                                      { return static_cast<U>(std::nearbyintf(x)); }, // ties follow the rounding mode, not "away from zero"
+                                                      self);
+        }
+
+        template <class A> // nearbyint_as_int (double): rounds each lane following the current rounding mode
+        inline batch<as_integer_t<double>, A>
+        nearbyint_as_int(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            using U = as_integer_t<double>;
+            return kernel::detail::apply_transform<U>([](double x) noexcept -> U
+                                                      { return static_cast<U>(std::nearbyint(x)); }, // ties follow the rounding mode, not "away from zero"
+                                                      self);
+        }
+
+        // nextafter
+        namespace detail
+        {
+            template <class T, class A, bool is_int = std::is_integral<T>::value> // primary template; the floating-point case is specialized on is_int == false
+            struct nextafter_kernel
+            {
+                using batch_type = batch<T, A>;
+
+                static inline batch_type next(batch_type const& b) noexcept
+                {
+                    return b; // this (integral) variant returns its argument unchanged
+                }
+
+                static inline batch_type prev(batch_type const& b) noexcept
+                {
+                    return b; // this (integral) variant returns its argument unchanged
+                }
+            };
+
+            template <class T, class A> // maps a floating-point value type to the same-width signed-integer batch
+            struct bitwise_cast_batch;
+
+            template <class A>
+            struct bitwise_cast_batch<float, A>
+            {
+                using type = batch<int32_t, A>; // float <-> int32_t
+            };
+
+            template <class A>
+            struct bitwise_cast_batch<double, A>
+            {
+                using type = batch<int64_t, A>; // double <-> int64_t
+            };
+
+            template <class T, class A> // non-integral specialization: steps through the raw integer representation
+            struct nextafter_kernel<T, A, false>
+            {
+                using batch_type = batch<T, A>;
+                using int_batch = typename bitwise_cast_batch<T, A>::type;
+                using int_type = typename int_batch::value_type;
+
+                static inline batch_type next(const batch_type& b) noexcept
+                {
+                    batch_type n = ::xsimd::bitwise_cast<T>(::xsimd::bitwise_cast<int_type>(b) + int_type(1)); // bit pattern + 1; NOTE(review): for negative b this grows the magnitude - confirm intended
+                    return select(b == constants::infinity<batch_type>(), b, n); // +inf stays +inf
+                }
+
+                static inline batch_type prev(const batch_type& b) noexcept
+                {
+                    batch_type p = ::xsimd::bitwise_cast<T>(::xsimd::bitwise_cast<int_type>(b) - int_type(1)); // bit pattern - 1; NOTE(review): same caveat for negative b
+                    return select(b == constants::minusinfinity<batch_type>(), b, p); // -inf stays -inf
+                }
+            };
+        }
+        template <class A, class T>
+        inline batch<T, A> nextafter(batch<T, A> const& from, batch<T, A> const& to, requires_arch<generic>) noexcept
+        {
+            using stepper = detail::nextafter_kernel<T, A>; // supplies the per-type next/prev steps
+            const batch<T, A> stepped = select(to > from, stepper::next(from), stepper::prev(from));
+            return select(from == to, from, stepped); // lanes already equal to the target are left alone
+        }
+
+        // pow
+        /* origin: boost/simd/arch/common/simd/function/pow.hpp*/
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A, class T> // real pow: exp(other * log(|self|)) with sign and validity fix-ups
+        inline batch<T, A> pow(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const auto zero = batch_type(0.);
+            auto negx = self < zero; // lanes with a negative base
+            auto iszero = self == zero; // lanes with a zero base
+            constexpr T e = static_cast<T>(2.718281828459045);
+            auto adj_self = select(iszero, batch_type(e), abs(self)); // |self|, with zero bases replaced by e so log() stays finite
+            batch_type z = exp(other * log(adj_self));
+            z = select(iszero, zero, z); // NOTE(review): zero bases yield 0 for every exponent, including other <= 0 - confirm intended
+            z = select(is_odd(other) && negx, -z, z); // negative base with odd exponent: negate
+            auto invalid = negx && !(is_flint(other) || isinf(other)); // negative base and an exponent that is neither flint nor infinite
+            return select(invalid, constants::nan<batch_type>(), z); // invalid lanes become NaN
+        }
+
+        template <class A, class T> // complex pow a^z, computed in polar form
+        inline batch<std::complex<T>, A> pow(const batch<std::complex<T>, A>& a, const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            using cplx_batch = batch<std::complex<T>, A>;
+            using real_batch = typename cplx_batch::real_batch;
+            real_batch absa = abs(a); // |a|
+            real_batch arga = arg(a); // arg(a)
+            real_batch x = z.real();
+            real_batch y = z.imag();
+            real_batch r = pow(absa, x); // modulus for a purely real exponent
+            real_batch theta = x * arga; // angle for a purely real exponent
+            real_batch ze(0);
+            auto cond = (y == ze); // lanes whose exponent has a zero imaginary part
+            r = select(cond, r, r * exp(-y * arga)); // otherwise scale the modulus by exp(-y * arg(a))
+            theta = select(cond, theta, theta + y * log(absa)); // ... and add y * log|a| to the angle
+            return select(absa == ze, cplx_batch(ze), cplx_batch(r * cos(theta), r * sin(theta))); // |a| == 0 gives 0
+        }
+
+        // reciprocal
+        template <class T, class A, class = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
+        inline batch<T, A> reciprocal(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            // Lane-wise 1 / self for floating-point batches, expressed through div.
+            const batch<T, A> unit(1);
+            return div(unit, self);
+        }
+
+        // reduce_add (complex): reduces the real and the imaginary lanes independently
+        template <class A, class T>
+        inline std::complex<T> reduce_add(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+        {
+            return std::complex<T>(reduce_add(self.real()), reduce_add(self.imag()));
+        }
+
+        namespace detail
+        {
+            template <class T, T N> // index generator used by detail::reduce below
+            struct split_high
+            {
+                static constexpr T get(T i, T) // second argument is unused
+                {
+                    return i >= N ? (i % 2) : i + N; // positions below N read lane i + N; the others read lane i % 2
+                }
+            };
+
+            template <class Op, class A, class T> // recursion end (level 1)
+            inline T reduce(Op, batch<T, A> const& self, std::integral_constant<unsigned, 1>) noexcept
+            {
+                return self.get(0); // the result is lane 0
+            }
+
+            template <class Op, class A, class T, unsigned Lvl> // one halving step of the reduction
+            inline T reduce(Op op, batch<T, A> const& self, std::integral_constant<unsigned, Lvl>) noexcept
+            {
+                using index_type = as_unsigned_integer_t<T>;
+                batch<T, A> split = swizzle(self, make_batch_constant<batch<index_type, A>, split_high<index_type, Lvl / 2>>()); // permute self with the split_high<Lvl / 2> index pattern
+                return reduce(op, op(split, self), std::integral_constant<unsigned, Lvl / 2>()); // combine with op, then recurse at Lvl / 2
+            }
+        }
+
+        // reduce_max: folds all lanes with max through detail::reduce
+        template <class A, class T>
+        inline T reduce_max(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const auto lane_max = [](batch_type const& lhs, batch_type const& rhs) { return max(lhs, rhs); };
+            return detail::reduce(lane_max, self, std::integral_constant<unsigned, batch_type::size>());
+        }
+
+        // reduce_min: folds all lanes with min through detail::reduce
+        template <class A, class T>
+        inline T reduce_min(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const auto lane_min = [](batch_type const& lhs, batch_type const& rhs) { return min(lhs, rhs); };
+            return detail::reduce(lane_min, self, std::integral_constant<unsigned, batch_type::size>());
+        }
+
+        // remainder
+        template <class A> // float overload
+        inline batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
+        {
+            return fnma(nearbyint(self / other), other, self); // passes the rounded quotient, other and self to fnma
+        }
+        template <class A> // double overload
+        inline batch<double, A> remainder(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
+        {
+            return fnma(nearbyint(self / other), other, self); // passes the rounded quotient, other and self to fnma
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> remainder(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            const auto rem = self % other; // plain lane-wise modulo
+            return select(rem <= other / 2, rem, rem - other); // keep rem up to other / 2, otherwise shift it down by other
+        }
+
+        // select (complex): picks the real and the imaginary parts separately with the same mask
+        template <class A, class T>
+        inline batch<std::complex<T>, A> select(batch_bool<T, A> const& cond, batch<std::complex<T>, A> const& true_br, batch<std::complex<T>, A> const& false_br, requires_arch<generic>) noexcept
+        {
+            return batch<std::complex<T>, A>(select(cond, true_br.real(), false_br.real()), select(cond, true_br.imag(), false_br.imag()));
+        }
+
+        // sign
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> sign(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            const batch<T, A> zero(0);
+            const batch<T, A> one(1);
+            return select(self > zero, one, zero) - select(self < zero, one, zero); // (self > 0) - (self < 0), lane by lane
+        }
+
+        namespace detail
+        {
+            template <class T, class A> // shared float/double implementation of sign
+            inline batch<T, A> signf(batch<T, A> const& self) noexcept
+            {
+                using batch_type = batch<T, A>;
+                batch_type res = select(self > batch_type(0.f), batch_type(1.f), batch_type(0.f)) - select(self < batch_type(0.f), batch_type(1.f), batch_type(0.f)); // (self > 0) - (self < 0)
+#ifdef XSIMD_NO_NANS
+                return res;
+#else
+                return select(isnan(self), constants::nan<batch_type>(), res); // NaN lanes stay NaN
+#endif
+            }
+        }
+
+        template <class A> // float overload
+        inline batch<float, A> sign(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            return detail::signf(self); // delegates to the shared implementation
+        }
+        template <class A> // double overload
+        inline batch<double, A> sign(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            return detail::signf(self); // delegates to the shared implementation
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> sign(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            // Complex sign: sign of the real part where that part is nonzero, sign of the imaginary part otherwise.
+            using cplx_batch = batch<std::complex<T>, A>;
+            using real_batch = typename cplx_batch::real_batch;
+            const auto re = z.real();
+            const auto im = z.imag();
+            const auto re_nonzero = (re != real_batch(0.));
+            return select(re_nonzero, cplx_batch(sign(re)), cplx_batch(sign(im)));
+        }
+
+        // signnz
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type> // integral overload
+        inline batch<T, A> signnz(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            return (self >> (sizeof(T) * 8 - 1)) | batch_type(1.); // top bit shifted down to bit 0, OR-ed with 1; presumably relies on an arithmetic shift for signed T - confirm
+        }
+
+        namespace detail
+        {
+            template <class T, class A> // shared float/double implementation of signnz
+            inline batch<T, A> signnzf(batch<T, A> const& self) noexcept
+            {
+                using batch_type = batch<T, A>;
+#ifndef XSIMD_NO_NANS
+                return select(isnan(self), constants::nan<batch_type>(), batch_type(1.) | (constants::signmask<batch_type>() & self)); // NaN lanes stay NaN; others are 1 OR-ed with the signmask bits of self
+#else
+                return batch_type(1.) | (constants::signmask<batch_type>() & self); // 1 OR-ed with the signmask bits of self
+#endif
+            }
+        }
+
+        template <class A> // float overload
+        inline batch<float, A> signnz(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            return detail::signnzf(self); // delegates to the shared implementation
+        }
+        template <class A> // double overload
+        inline batch<double, A> signnz(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            return detail::signnzf(self); // delegates to the shared implementation
+        }
+
+        // sqrt
+        template <class A, class T> // complex square root
+        inline batch<std::complex<T>, A> sqrt(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+        {
+
+            constexpr T csqrt_scale_factor = std::is_same<T, float>::value ? 6.7108864e7f : 1.8014398509481984e16; // 2^26 (float) / 2^54 (double)
+            constexpr T csqrt_scale = std::is_same<T, float>::value ? 1.220703125e-4f : 7.450580596923828125e-9; // 2^-13 (float) / 2^-27 (double)
+            using batch_type = batch<std::complex<T>, A>;
+            using real_batch = batch<T, A>;
+            real_batch x = z.real();
+            real_batch y = z.imag();
+            real_batch sqrt_x = sqrt(fabs(x)); // used when the imaginary part is zero
+            real_batch sqrt_hy = sqrt(0.5 * fabs(y)); // used when the real part is zero
+            auto cond = (fabs(x) > real_batch(4.) || fabs(y) > real_batch(4.)); // lanes with a component above 4 in magnitude
+            x = select(cond, x * 0.25, x * csqrt_scale_factor); // scale down those lanes, scale up the others
+            y = select(cond, y * 0.25, y * csqrt_scale_factor);
+            real_batch scale = select(cond, real_batch(2.), real_batch(csqrt_scale)); // factor applied to the result to undo the scaling
+            real_batch r = abs(batch_type(x, y)); // modulus of the scaled value
+
+            auto condxp = x > real_batch(0.); // lanes with a positive real part
+            real_batch t0 = select(condxp, xsimd::sqrt(0.5 * (r + x)), xsimd::sqrt(0.5 * (r - x)));
+            real_batch r0 = scale * fabs((0.5 * y) / t0);
+            t0 *= scale;
+            real_batch t = select(condxp, t0, r0); // real part of the general result
+            r = select(condxp, r0, t0); // magnitude of its imaginary part
+            batch_type resg = select(y < real_batch(0.), batch_type(t, -r), batch_type(t, r)); // imaginary part takes the sign of y
+            real_batch ze(0.);
+
+            return select(y == ze, // special cases first: zero imaginary part, then zero real part
+                          select(x == ze,
+                                 batch_type(ze, ze),
+                                 select(x < ze, batch_type(ze, sqrt_x), batch_type(sqrt_x, ze))),
+                          select(x == ze,
+                                 select(y > ze, batch_type(sqrt_hy, sqrt_hy), batch_type(sqrt_hy, -sqrt_hy)),
+                                 resg));
+        }
+
+        // tgamma
+
+        namespace detail
+        {
+            /* origin: boost/simd/arch/common/detail/generic/stirling_kernel.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class B> // per-type coefficients and limits consumed by stirling()
+            struct stirling_kernel;
+
+            template <class A>
+            struct stirling_kernel<batch<float, A>>
+            {
+                using batch_type = batch<float, A>;
+                static inline batch_type compute(const batch_type& x) noexcept
+                {
+                    return horner<batch_type, // coefficients are given as float bit patterns
+                                  0x3daaaaab,
+                                  0x3b638e39,
+                                  0xbb2fb930,
+                                  0xb970b359>(x);
+                }
+
+                static inline batch_type split_limit() noexcept
+                {
+                    return batch_type(bit_cast<float>(uint32_t(0x41d628f6))); // limit given as a float bit pattern
+                }
+
+                static inline batch_type large_limit() noexcept
+                {
+                    return batch_type(bit_cast<float>(uint32_t(0x420c28f3))); // limit given as a float bit pattern
+                }
+            };
+
+            template <class A> // double-precision coefficients and limits consumed by stirling()
+            struct stirling_kernel<batch<double, A>>
+            {
+                using batch_type = batch<double, A>;
+                static inline batch_type compute(const batch_type& x) noexcept
+                {
+                    return horner<batch_type, // coefficients are given as double bit patterns
+                                  0x3fb5555555555986ull, //   8.33333333333482257126E-2
+                                  0x3f6c71c71b98c5fdull, //   3.47222221605458667310E-3
+                                  0xbf65f72607d44fd7ull, //  -2.68132617805781232825E-3
+                                  0xbf2e166b27e61d7cull, //  -2.29549961613378126380E-4
+                                  0x3f49cc72592d7293ull //   7.87311395793093628397E-4
+                                  >(x);
+                }
+
+                static inline batch_type split_limit() noexcept
+                {
+                    return batch_type(bit_cast<double>(uint64_t(0x4061e083ba3443d4))); // limit given as a double bit pattern
+                }
+
+                static inline batch_type large_limit() noexcept
+                {
+                    return batch_type(bit_cast<double>(uint64_t(0x4065800000000000))); // limit given as a double bit pattern
+                }
+            };
+
+            /* origin: boost/simd/arch/common/simd/function/stirling.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class T, class A> // Stirling-type evaluation used by tgamma_large_negative
+            inline batch<T, A> stirling(const batch<T, A>& a) noexcept
+            {
+                using batch_type = batch<T, A>;
+                const batch_type stirlingsplitlim = stirling_kernel<batch_type>::split_limit();
+                const batch_type stirlinglargelim = stirling_kernel<batch_type>::large_limit();
+                batch_type x = select(a >= batch_type(0.), a, constants::nan<batch_type>()); // lanes failing (a >= 0) become NaN
+                batch_type w = batch_type(1.) / x;
+                w = fma(w, stirling_kernel<batch_type>::compute(w), batch_type(1.)); // 1 + w * P(w), P taken from the kernel
+                batch_type y = exp(-x);
+                auto test = (x < stirlingsplitlim); // lanes below the split limit
+                batch_type z = x - batch_type(0.5);
+                z = select(test, z, batch_type(0.5) * z); // at or above the split limit only half the exponent is used ...
+                batch_type v = exp(z * log(abs(x)));
+                y *= v;
+                y = select(test, y, y * v); // ... and v is multiplied in a second time
+                y *= constants::sqrt_2pi<batch_type>() * w;
+#ifndef XSIMD_NO_INFINITIES
+                y = select(isinf(x), x, y); // infinite lanes return x itself
+#endif
+                return select(x > stirlinglargelim, constants::infinity<batch_type>(), y); // above the large limit the result is +inf
+            }
+
+            /* origin: boost/simd/arch/common/detail/generic/gamma_kernel.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class B> // per-type approximation evaluated by tgamma_other on x - 2
+            struct tgamma_kernel;
+
+            template <class A>
+            struct tgamma_kernel<batch<float, A>>
+            {
+                using batch_type = batch<float, A>;
+                static inline batch_type compute(const batch_type& x) noexcept
+                {
+                    return horner<batch_type, // single polynomial, coefficients given as float bit patterns
+                                  0x3f800000UL, //  9.999999757445841E-01
+                                  0x3ed87799UL, //  4.227874605370421E-01
+                                  0x3ed2d411UL, //  4.117741948434743E-01
+                                  0x3da82a34UL, //  8.211174403261340E-02
+                                  0x3d93ae7cUL, //  7.211014349068177E-02
+                                  0x3b91db14UL, //  4.451165155708328E-03
+                                  0x3ba90c99UL, //  5.158972571345137E-03
+                                  0x3ad28b22UL //  1.606319369134976E-03
+                                  >(x);
+                }
+            };
+
+            template <class A> // double precision: ratio of two polynomials
+            struct tgamma_kernel<batch<double, A>>
+            {
+                using batch_type = batch<double, A>;
+                static inline batch_type compute(const batch_type& x) noexcept
+                {
+                    return horner<batch_type, // numerator
+                                  0x3ff0000000000000ULL, // 9.99999999999999996796E-1
+                                  0x3fdfa1373993e312ULL, // 4.94214826801497100753E-1
+                                  0x3fca8da9dcae7d31ULL, // 2.07448227648435975150E-1
+                                  0x3fa863d918c423d3ULL, // 4.76367800457137231464E-2
+                                  0x3f8557cde9db14b0ULL, // 1.04213797561761569935E-2
+                                  0x3f5384e3e686bfabULL, // 1.19135147006586384913E-3
+                                  0x3f24fcb839982153ULL // 1.60119522476751861407E-4
+                                  >(x)
+                        / horner<batch_type, // denominator
+                                 0x3ff0000000000000ULL, //  1.00000000000000000320E00
+                                 0x3fb24944c9cd3c51ULL, //  7.14304917030273074085E-2
+                                 0xbfce071a9d4287c2ULL, // -2.34591795718243348568E-1
+                                 0x3fa25779e33fde67ULL, //  3.58236398605498653373E-2
+                                 0x3f8831ed5b1bb117ULL, //  1.18139785222060435552E-2
+                                 0xBf7240e4e750b44aULL, // -4.45641913851797240494E-3
+                                 0x3f41ae8a29152573ULL, //  5.39605580493303397842E-4
+                                 0xbef8487a8400d3aFULL // -2.31581873324120129819E-5
+                                 >(x);
+                }
+            };
+
+            /* origin: boost/simd/arch/common/simd/function/gamma.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class B> // tgamma path for self < -33; tgamma() passes abs(self) as a
+            inline B tgamma_large_negative(const B& a) noexcept
+            {
+                B st = stirling(a);
+                B p = floor(a);
+                B sgngam = select(is_even(p), -B(1.), B(1.)); // result sign: -1 when floor(a) is even, +1 otherwise
+                B z = a - p; // fractional part of a
+                auto test2 = z < B(0.5);
+                z = select(test2, z - B(1.), z); // fractions below 0.5 are shifted down by 1
+                z = a * sin(z, trigo_pi_tag()); // presumably sin(pi * z) given the trigo_pi_tag overload - confirm
+                z = abs(z);
+                return sgngam * constants::pi<B>() / (z * st);
+            }
+
+            template <class B, class BB> // tgamma path for every lane not handled by tgamma_large_negative
+            inline B tgamma_other(const B& a, const BB& test) noexcept
+            {
+                B x = select(test, B(2.), a); // lanes flagged by test get the placeholder 2 (the caller discards their result)
+#ifndef XSIMD_NO_INFINITIES
+                auto inf_result = (a == constants::infinity<B>());
+                x = select(inf_result, B(2.), x); // +inf lanes also get the placeholder so the loops below terminate
+#endif
+                B z = B(1.); // running product / quotient
+                auto test1 = (x >= B(3.));
+                while (any(test1)) // bring x below 3: x -= 1, z *= x
+                {
+                    x = select(test1, x - B(1.), x);
+                    z = select(test1, z * x, z);
+                    test1 = (x >= B(3.));
+                }
+                test1 = (x < B(0.));
+                while (any(test1)) // bring negative x up to >= 0: z /= x, x += 1
+                {
+                    z = select(test1, z / x, z);
+                    x = select(test1, x + B(1.), x);
+                    test1 = (x < B(0.));
+                }
+                auto test2 = (x < B(2.));
+                while (any(test2)) // bring x up to >= 2: z /= x, x += 1
+                {
+                    z = select(test2, z / x, z);
+                    x = select(test2, x + B(1.), x);
+                    test2 = (x < B(2.));
+                }
+                x = z * tgamma_kernel<B>::compute(x - B(2.)); // kernel is evaluated on x - 2
+#ifndef XSIMD_NO_INFINITIES
+                return select(inf_result, a, x); // +inf lanes return a itself
+#else
+                return x;
+#endif
+            }
+        }
+
+        template <class A, class T> // gamma function entry point
+        inline batch<T, A> tgamma(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            auto nan_result = (self < batch_type(0.) && is_flint(self)); // negative lanes for which is_flint holds
+#ifndef XSIMD_NO_INVALIDS
+            nan_result = isnan(self) || nan_result; // NaN inputs give NaN as well
+#endif
+            batch_type q = abs(self);
+            auto test = (self < batch_type(-33.)); // lanes routed to the large-negative path
+            batch_type r = constants::nan<batch_type>();
+            if (any(test))
+            {
+                r = detail::tgamma_large_negative(q);
+                if (all(test)) // every lane is large-negative: the general path can be skipped
+                    return select(nan_result, constants::nan<batch_type>(), r);
+            }
+            batch_type r1 = detail::tgamma_other(self, test);
+            batch_type r2 = select(test, r, r1); // merge the two paths lane by lane
+            return select(self == batch_type(0.), copysign(constants::infinity<batch_type>(), self), select(nan_result, constants::nan<batch_type>(), r2)); // zero lanes give infinity with the sign of self
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp
new file mode 100644
index 0000000000..bb40ddffc6
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -0,0 +1,397 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_MEMORY_HPP
+#define XSIMD_GENERIC_MEMORY_HPP
+
+#include <algorithm>
+#include <complex>
+#include <stdexcept>
+
+#include "../../types/xsimd_batch_constant.hpp"
+#include "./xsimd_generic_details.hpp"
+
+namespace xsimd
+{
+    template <class batch_type, typename batch_type::value_type... Values>
+    struct batch_constant; // forward declaration only; the complex swizzle overload in this file takes it as its mask type
+
+    namespace kernel
+    {
+
+        using namespace types;
+
+        // extract_pair
+        // Returns { other[i], ..., other[size - 1], self[0], ..., self[i - 1] }.
+        template <class A, class T>
+        inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
+        {
+            constexpr std::size_t size = batch<T, A>::size;
+            assert(i < size && "index in bounds");
+
+            alignas(A::alignment()) T self_buffer[size];
+            self.store_aligned(self_buffer);
+
+            alignas(A::alignment()) T other_buffer[size];
+            other.store_aligned(other_buffer);
+
+            alignas(A::alignment()) T concat_buffer[size];
+
+            // Two independent copies fill every lane for any i < size. A single loop
+            // bounded by (size - i) would copy only min(i, size - i) lanes of self and
+            // leave lanes [size - i, i) of the result unset whenever i > size / 2.
+            // Low lanes: the upper (size - i) elements of other.
+            std::copy(other_buffer + i, other_buffer + size, concat_buffer);
+            // High lanes: the lower i elements of self.
+            std::copy(self_buffer, self_buffer + i, concat_buffer + (size - i));
+            return batch<T, A>::load_aligned(concat_buffer);
+        }
+
+        // gather
+        namespace detail
+        {
+            // Recursion base (N == 0): starts from a default-constructed batch and fills lane 0.
+            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
+            inline batch<T, A> gather(U const* src, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
+            {
+                const T lane_value = static_cast<T>(src[index.get(I)]);
+                return insert(batch<T, A> {}, lane_value, I);
+            }
+            // Recursive step (N != 0): gathers lanes [0, N) first, then inserts lane N.
+            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
+            inline batch<T, A> gather(U const* src, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
+            {
+                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");
+                const batch<T, A> lower_lanes = gather<N - 1, T, A>(src, index, ::xsimd::index<N - 1> {});
+                const T lane_value = static_cast<T>(src[index.get(I)]);
+                return insert(lower_lanes, lane_value, I);
+            }
+        } // namespace detail
+
+        // Gather with runtime indexes; the source elements already have the batch value type.
+        template <typename T, typename A, typename V>
+        inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<V, A> const& index, kernel::requires_arch<generic>) noexcept
+        {
+            using index_batch = batch<V, A>;
+            static_assert(batch<T, A>::size == index_batch::size, "Index and destination sizes must match");
+            // The compile-time recursion starts at the last lane and unwinds down to lane 0.
+            constexpr size_t last_lane = index_batch::size - 1;
+            return detail::gather<last_lane, T, A>(src, index, {});
+        }
+
+        // Gather with runtime indexes and mismatched strides.
+        // Each element is read as U and converted to T inside the per-lane recursion.
+        template <typename T, typename A, typename U, typename V>
+        inline detail::sizes_mismatch_t<T, U, batch<T, A>>
+        gather(batch<T, A> const&, U const* src, batch<V, A> const& index, kernel::requires_arch<generic>) noexcept
+        {
+            using index_batch = batch<V, A>;
+            static_assert(batch<T, A>::size == index_batch::size, "Index and destination sizes must match");
+            constexpr size_t last_lane = index_batch::size - 1;
+            return detail::gather<last_lane, T, A>(src, index, {});
+        }
+
+        // Gather with runtime indexes and matching strides.
+        template <typename T, typename A, typename U, typename V>
+        inline detail::stride_match_t<T, U, batch<T, A>>
+        gather(batch<T, A> const&, U const* src, batch<V, A> const& index, kernel::requires_arch<generic>) noexcept
+        {
+            using source_batch = batch<U, A>;
+            static_assert(batch<T, A>::size == batch<V, A>::size, "Index and destination sizes must match");
+            // Gather in the source type U via the kernel for A, then cast the whole batch to T.
+            const auto gathered = kernel::gather(source_batch {}, src, index, A {});
+            return batch_cast<T>(gathered);
+        }
+
+        // insert
+        template <class A, class T, size_t I>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            // Compile-time mask generator: true for every lane that keeps its value from self.
+            struct keep_lane
+            {
+                static constexpr bool get(size_t lane, size_t /*size*/) { return lane != I; }
+            };
+            // Broadcast val to all lanes; the select below takes it only in lane I.
+            batch_type broadcast(val);
+            return select(make_batch_bool_constant<batch_type, keep_lane>(), self, broadcast);
+        }
+
+        // get
+        template <class A, size_t I, class T>
+        inline T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
+        { // Compile-time lane access: spill the batch to an aligned array and read lane I.
+            alignas(A::alignment()) T lanes[batch<T, A>::size];
+            self.store_aligned(lanes);
+            return lanes[I];
+        }
+
+        template <class A, size_t I, class T>
+        inline T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
+        { // Masks store one bool per lane (as in the runtime-index overload), so the staging array is bool, not T; lane I is returned converted to T.
+            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
+            self.store_aligned(&buffer[0]);
+            return buffer[I];
+        }
+
+        template <class A, size_t I, class T>
+        inline auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
+        { // Stage the lanes as complex values (value_type), as the runtime-index overload does; an array of T of that length cannot hold the complex lanes.
+            alignas(A::alignment()) typename batch<std::complex<T>, A>::value_type buffer[batch<std::complex<T>, A>::size];
+            self.store_aligned(&buffer[0]);
+            return buffer[I];
+        }
+
+        template <class A, class T>
+        inline T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
+        { // Runtime lane access: spill to an aligned array, then index it. No bounds check on i.
+            alignas(A::alignment()) T lanes[batch<T, A>::size];
+            self.store_aligned(lanes);
+            return lanes[i];
+        }
+
+        template <class A, class T>
+        inline T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
+        { // Runtime lane access for masks: lanes are staged as bool and the chosen one is returned as T.
+            alignas(A::alignment()) bool flags[batch_bool<T, A>::size];
+            self.store_aligned(flags);
+            return flags[i];
+        }
+
+        template <class A, class T>
+        inline auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
+        { // Runtime lane access for complex batches; lanes are staged as complex values.
+            using complex_batch = batch<std::complex<T>, A>;
+            alignas(A::alignment()) typename complex_batch::value_type lanes[complex_batch::size];
+            self.store_aligned(&lanes[0]);
+            return lanes[i];
+        }
+
+        // load_aligned
+        namespace detail
+        {
+            template <class A, class T_in, class T_out>
+            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
+            { // Fast path: load as batch<T_in, A>, then convert the whole batch with fast_cast.
+                const batch<T_in, A> loaded = batch<T_in, A>::load_aligned(mem);
+                return fast_cast(loaded, batch<T_out, A>(), A {});
+            }
+            template <class A, class T_in, class T_out>
+            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
+            { // Slow path: convert element by element into an aligned staging array of T_out.
+                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
+                constexpr std::size_t lane_count = batch<T_out, A>::size;
+                alignas(A::alignment()) T_out staging[lane_count];
+                std::copy_n(mem, lane_count, staging);
+                return batch<T_out, A>::load_aligned(staging);
+            }
+        }
+        // Entry point: the detail::conversion_type tag picks the fast or the slow helper.
+        template <class A, class T_in, class T_out>
+        inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
+        {
+            return detail::load_aligned<A>(mem, cvt, A {}, detail::conversion_type<A, T_in, T_out> {});
+        }
+
+        // load_unaligned
+        namespace detail
+        {
+            template <class A, class T_in, class T_out>
+            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
+            { // Fast path: unaligned load as batch<T_in, A>, then a whole-batch fast_cast.
+                const batch<T_in, A> loaded = batch<T_in, A>::load_unaligned(mem);
+                return fast_cast(loaded, batch<T_out, A>(), A {});
+            }
+            template <class A, class T_in, class T_out>
+            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
+            {
+                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
+                // Delegates to the slow aligned loader, which copies mem element-wise into its own buffer.
+                return load_aligned<A>(mem, cvt, generic {}, with_slow_conversion {});
+            }
+        }
+        // Entry point: the detail::conversion_type tag picks the fast or the slow helper.
+        template <class A, class T_in, class T_out>
+        inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
+        {
+            return detail::load_unaligned<A>(mem, cvt, generic {}, detail::conversion_type<A, T_in, T_out> {});
+        }
+
+        namespace detail
+        {
+            // Scatter with runtime indexes.
+            // Recursion base (N == 0): writes lane 0 only.
+            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
+            inline void scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
+            {
+                const auto slot = index.get(I);
+                dst[slot] = static_cast<U>(src.get(I));
+            }
+
+            // Recursive step (N != 0): lanes [0, N) are written first, lane N last.
+            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
+            inline void
+            scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
+            {
+                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");
+
+                kernel::detail::scatter<N - 1, T, A, U, V>(src, dst, index, ::xsimd::index<N - 1> {});
+                const auto slot = index.get(I);
+                dst[slot] = static_cast<U>(src.get(I));
+            }
+        } // namespace detail
+
+        // Scatter with runtime indexes; the destination elements have the batch value type.
+        template <typename A, typename T, typename V>
+        inline void scatter(batch<T, A> const& src, T* dst, batch<V, A> const& index,
+                            kernel::requires_arch<generic>) noexcept
+        {
+            using index_batch = batch<V, A>;
+            static_assert(batch<T, A>::size == index_batch::size, "Source and index sizes must match");
+            // Begin the compile-time recursion at the last lane.
+            constexpr size_t last_lane = index_batch::size - 1;
+            kernel::detail::scatter<last_lane, T, A, T, V>(src, dst, index, {});
+        }
+
+        // Scatter overload selected through detail::sizes_mismatch_t<T, U, void>;
+        // each lane is converted from T to U as it is written.
+        template <typename A, typename T, typename U, typename V>
+        inline detail::sizes_mismatch_t<T, U, void>
+        scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index, kernel::requires_arch<generic>) noexcept
+        {
+            using index_batch = batch<V, A>;
+            static_assert(batch<T, A>::size == index_batch::size, "Source and index sizes must match");
+            constexpr size_t last_lane = index_batch::size - 1;
+            kernel::detail::scatter<last_lane, T, A, U, V>(src, dst, index, {});
+        }
+
+        // Scatter overload selected through detail::stride_match_t<T, U, void>:
+        // the batch is cast to U as a whole, then scattered by the kernel for A.
+        template <typename A, typename T, typename U, typename V>
+        inline detail::stride_match_t<T, U, void>
+        scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index, kernel::requires_arch<generic>) noexcept
+        {
+            using index_batch = batch<V, A>;
+            static_assert(batch<T, A>::size == index_batch::size, "Source and index sizes must match");
+            const auto converted = batch_cast<U>(src);
+            kernel::scatter<A>(converted, dst, index, A {});
+        }
+
+        // store
+        // Writes one bool per lane of the mask to mem.
+        template <class T, class A>
+        inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
+        {
+            constexpr auto lane_count = batch_bool<T, A>::size;
+            alignas(A::alignment()) T lanes[lane_count];
+            // Materialize the mask as a batch of T, then narrow each lane to bool.
+            kernel::store_aligned<A>(&lanes[0], batch<T, A>(self), A {});
+            std::transform(lanes, lanes + lane_count, mem, [](T lane) { return bool(lane); });
+        }
+
+        // store_aligned
+        template <class A, class T_in, class T_out>
+        inline void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
+        { // Converting store: stage the lanes as T_in, then copy them to mem with element-wise conversion.
+            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
+            alignas(A::alignment()) T_in staging[batch<T_in, A>::size];
+            store_aligned(&staging[0], self);
+            std::copy_n(staging, batch<T_in, A>::size, mem);
+        }
+
+        // store_unaligned
+        template <class A, class T_in, class T_out>
+        inline void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
+        { // Forwards to the converting store_aligned with the generic tag; that overload writes mem element by element.
+            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
+            store_aligned<A>(mem, self, generic {});
+        }
+
+        // swizzle (complex): applies the same mask to the real and to the imaginary parts.
+        template <class A, class T, class ITy, ITy... Vs>
+        inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
+        {
+            return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
+        }
+
+        namespace detail
+        { // Generic fallbacks for the complex pack/unpack helpers: each one fails to compile when instantiated.
+            template <class A, class T>
+            inline batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
+            {
+                static_assert(std::is_same<T, void>::value, "load_complex not implemented for the required architecture"); // depends on T, so it is only evaluated on instantiation
+            }
+
+            template <class A, class T>
+            inline batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
+            {
+                static_assert(std::is_same<T, void>::value, "complex_high not implemented for the required architecture"); // same dependent-condition technique
+            }
+
+            template <class A, class T>
+            inline batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
+            {
+                static_assert(std::is_same<T, void>::value, "complex_low not implemented for the required architecture"); // same dependent-condition technique
+            }
+        }
+
+        // load_complex_aligned
+        template <class A, class T_out, class T_in>
+        inline batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
+        { // Loads two consecutive real batches from the scalar view of mem and hands them to detail::load_complex.
+            using real_batch = batch<T_out, A>;
+            T_in const* scalars = reinterpret_cast<T_in const*>(mem);
+            const real_batch first_chunk = real_batch::load_aligned(scalars);
+            const real_batch second_chunk = real_batch::load_aligned(scalars + real_batch::size);
+            return detail::load_complex(first_chunk, second_chunk, A {});
+        }
+
+        // load_complex_unaligned
+        template <class A, class T_out, class T_in>
+        inline batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
+        { // Same as the aligned variant, but both real batches are read with unaligned loads.
+            using real_batch = batch<T_out, A>;
+            T_in const* scalars = reinterpret_cast<T_in const*>(mem);
+            const real_batch first_chunk = real_batch::load_unaligned(scalars);
+            const real_batch second_chunk = real_batch::load_unaligned(scalars + real_batch::size);
+            return detail::load_complex(first_chunk, second_chunk, A {});
+        }
+
+        // store_complex_aligned
+        template <class A, class T_out, class T_in>
+        inline void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
+        { // Writes complex_low(src) at the start of dst's scalar view and complex_high(src) one batch width later.
+            using real_batch = batch<T_in, A>;
+            T_out* scalars = reinterpret_cast<T_out*>(dst);
+            const real_batch lower = detail::complex_low(src, A {});
+            const real_batch upper = detail::complex_high(src, A {});
+            lower.store_aligned(scalars);
+            upper.store_aligned(scalars + real_batch::size);
+        }
+
+        // store_complex_unaligned
+        template <class A, class T_out, class T_in>
+        inline void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
+        { // Same layout as the aligned variant, written with unaligned stores.
+            using real_batch = batch<T_in, A>;
+            T_out* scalars = reinterpret_cast<T_out*>(dst);
+            const real_batch lower = detail::complex_low(src, A {});
+            const real_batch upper = detail::complex_high(src, A {});
+            lower.store_unaligned(scalars);
+            upper.store_unaligned(scalars + real_batch::size);
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_rounding.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_rounding.hpp
new file mode 100644
index 0000000000..b6a79a4515
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_rounding.hpp
@@ -0,0 +1,72 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_ROUNDING_HPP
+#define XSIMD_GENERIC_ROUNDING_HPP
+
+#include "./xsimd_generic_details.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+
+        using namespace types;
+
+        // ceil: trunc(self), raised by one in lanes where truncation fell below self.
+        template <class A, class T>
+        inline batch<T, A> ceil(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            const batch<T, A> toward_zero = trunc(self);
+            return select(toward_zero < self, toward_zero + 1, toward_zero);
+        }
+
+        // floor: trunc(self), lowered by one in lanes where truncation rose above self.
+        template <class A, class T>
+        inline batch<T, A> floor(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            const batch<T, A> toward_zero = trunc(self);
+            return select(toward_zero > self, toward_zero - 1, toward_zero);
+        }
+
+        // round: halfway cases go away from zero; lanes whose magnitude exceeds maxflint are returned unchanged.
+        template <class A, class T>
+        inline batch<T, A> round(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            const auto magnitude = abs(self);
+            const auto ceiling = ceil(magnitude);
+            const auto nearest = select(ceiling - 0.5 > magnitude, ceiling - 1, ceiling); // step back when the ceiling overshoots by more than 0.5
+            return select(magnitude > constants::maxflint<batch<T, A>>(), self, copysign(nearest, self));
+        }
+
+        // trunc: integral batches are already truncated and are returned unchanged.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> trunc(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return self;
+        }
+        template <class A>
+        inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<generic>) noexcept
+        { // float: lanes with |self| < maxflint are converted to int and back; all other lanes are kept as they are.
+            return select(abs(self) < constants::maxflint<batch<float, A>>(), to_float(to_int(self)), self);
+        }
+        template <class A>
+        inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<generic>) noexcept
+        { // double: lanes with |self| < maxflint are converted to int and back; all other lanes are kept as they are.
+            return select(abs(self) < constants::maxflint<batch<double, A>>(), to_float(to_int(self)), self);
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp
new file mode 100644
index 0000000000..2568a7253f
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp
@@ -0,0 +1,969 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_TRIGO_HPP
+#define XSIMD_GENERIC_TRIGO_HPP
+
+#include "./xsimd_generic_details.hpp"
+
+#include <array>
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+
+        using namespace types;
+
+        // acos: computed through asin, with a square-root argument reduction when |self| > 0.5.
+        template <class A, class T>
+        inline batch<T, A> acos(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            batch_type x = abs(self);
+            auto x_larger_05 = x > batch_type(0.5);
+            x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self); // sqrt((1 - |self|) / 2) in large lanes, self otherwise
+            x = asin(x);
+            x = select(x_larger_05, x + x, x); // large lanes: twice the asin of the reduced argument
+            x = select(self < batch_type(-0.5), constants::pi<batch_type>() - x, x); // lanes below -0.5: pi - x
+            return select(x_larger_05, x, constants::pio2<batch_type>() - x); // small lanes: pio2 - asin(self)
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> acos(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        { // Complex acos as pio2 - asin(z), assembled component by component.
+            using real_batch = typename batch<std::complex<T>, A>::real_batch;
+            const batch<std::complex<T>, A> asin_z = asin(z);
+            const real_batch re = constants::pio2<real_batch>() - asin_z.real();
+            return { re, -asin_z.imag() };
+        }
+
+        // acosh
+        /* origin: boost/simd/arch/common/simd/function/acosh.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A, class T>
+        inline batch<T, A> acosh(batch<T, A> const& self, requires_arch<generic>) noexcept
+        { // Real acosh evaluated through log1p, with a separate form for lanes where self - 1 > oneotwoeps.
+            using batch_type = batch<T, A>;
+            batch_type x = self - batch_type(1.);
+            auto test = x > constants::oneotwoeps<batch_type>(); // lanes treated as large arguments
+            batch_type z = select(test, self, x + sqrt(x + x + x * x)); // other lanes: x + sqrt(2x + x^2) with x = self - 1
+            batch_type l1pz = log1p(z);
+            return select(test, l1pz + constants::log_2<batch_type>(), l1pz); // large lanes: log1p(self) + log_2
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> acosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        { // Complex acosh: acos(z) multiplied by i.
+            using batch_type = batch<std::complex<T>, A>;
+            const batch_type acos_z = acos(z);
+            // i * (a + b i) = -b + a i
+            return batch_type(-acos_z.imag(), acos_z.real());
+        }
+
+        // asin (float): x + x * z * P(z); lanes with |self| > 0.5 use pio2 - 2 * (that value) on a reduced argument.
+        template <class A>
+        inline batch<float, A> asin(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<float, A>;
+            batch_type x = abs(self);
+            batch_type sign = bitofsign(self); // re-applied by the final xor
+            auto x_larger_05 = x > batch_type(0.5);
+            batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x); // (1 - |self|) / 2 in large lanes, |self|^2 otherwise
+            x = select(x_larger_05, sqrt(z), x); // large lanes continue with sqrt(z)
+            batch_type z1 = detail::horner<batch_type, // coefficients given as hexadecimal patterns (presumably IEEE-754 single precision)
+                                           0x3e2aaae4,
+                                           0x3d9980f6,
+                                           0x3d3a3ec7,
+                                           0x3cc617e3,
+                                           0x3d2cb352>(z);
+            z1 = fma(z1, z * x, x); // x + x * z * P(z)
+            z = select(x_larger_05, constants::pio2<batch_type>() - (z1 + z1), z1);
+            return z ^ sign;
+        }
+        template <class A>
+        inline batch<double, A> asin(batch<double, A> const& self, requires_arch<generic>) noexcept
+        { // asin (double): both candidate results are computed, then one is picked per lane (tiny |self|, |self| > ct1, otherwise).
+            using batch_type = batch<double, A>;
+            batch_type x = abs(self);
+            auto small_cond = x < constants::sqrteps<batch_type>(); // lanes where |self| itself is the result
+            batch_type ct1 = batch_type(bit_cast<double>(int64_t(0x3fe4000000000000))); // bit pattern of 0.625
+            batch_type zz1 = batch_type(1.) - x; // the large-argument candidate works on 1 - |self|
+            batch_type vp = zz1 * detail::horner<batch_type, 0x403c896240f3081dull, 0xc03991aaac01ab68ull, 0x401bdff5baf33e6aull, 0xbfe2079259f9290full, 0x3f684fc3988e9f08ull>(zz1) / detail::horner1<batch_type, 0x40756709b0b644beull, 0xc077fe08959063eeull, 0x40626219af6a7f42ull, 0xc035f2a2b6bf5d8cull>(zz1);
+            zz1 = sqrt(zz1 + zz1);
+            batch_type z = constants::pio4<batch_type>() - zz1;
+            zz1 = fms(zz1, vp, constants::pio_2lo<batch_type>());
+            z = z - zz1;
+            zz1 = z + constants::pio4<batch_type>(); // large-argument candidate
+            batch_type zz2 = self * self; // the default candidate works on self^2
+            z = zz2 * detail::horner<batch_type, 0xc020656c06ceafd5ull, 0x40339007da779259ull, 0xc0304331de27907bull, 0x4015c74b178a2dd9ull, 0xbfe34341333e5c16ull, 0x3f716b9b0bd48ad3ull>(zz2) / detail::horner1<batch_type, 0xc04898220a3607acull, 0x4061705684ffbf9dull, 0xc06265bb6d3576d7ull, 0x40519fc025fe9054ull, 0xc02d7b590b5e0eabull>(zz2);
+            zz2 = fma(x, z, x); // default candidate: x + x * z
+            return select(x > batch_type(1.), constants::nan<batch_type>(), // |self| > 1 yields NaN
+                          select(small_cond, x,
+                                 select(x > ct1, zz1, zz2))
+                              ^ bitofsign(self)); // the xor restores the sign of self
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> asin(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        { // Complex asin as -i * log(i z + sqrt(1 - z^2)); lanes with a zero imaginary part are handled separately.
+            using batch_type = batch<std::complex<T>, A>;
+            using real_batch = typename batch_type::real_batch;
+            real_batch x = z.real();
+            real_batch y = z.imag();
+
+            batch_type ct(-y, x); // i * z
+            batch_type zz(real_batch(1.) - (x - y) * (x + y), -2 * x * y); // 1 - z^2
+            zz = log(ct + sqrt(zz));
+            batch_type resg(zz.imag(), -zz.real()); // -i * zz
+
+            return select(y == real_batch(0.), // NOTE(review): real lanes with |x| > 1 get (pio2, 0) whatever the sign of x - confirm this is intended
+                          select(fabs(x) > real_batch(1.),
+                                 batch_type(constants::pio2<real_batch>(), real_batch(0.)),
+                                 batch_type(asin(x), real_batch(0.))),
+                          resg);
+        }
+
+        // asinh
+        /* origin: boost/simd/arch/common/simd/function/asinh.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        namespace detail
+        { // Helpers computing the mean of two batches; asinh (float) in this file calls average().
+            template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+            inline batch<T, A>
+            average(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
+            {
+                return (x1 & x2) + ((x1 ^ x2) >> 1); // integral mean computed bitwise, without forming x1 + x2
+            }
+
+            template <class A, class T>
+            inline batch<T, A>
+            averagef(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
+            {
+                using batch_type = batch<T, A>;
+                return fma(x1, batch_type(0.5), x2 * batch_type(0.5)); // x1 * 0.5 + x2 * 0.5
+            }
+            template <class A>
+            inline batch<float, A> average(batch<float, A> const& x1, batch<float, A> const& x2) noexcept
+            {
+                return averagef(x1, x2); // float batches use the floating-point form
+            }
+            template <class A>
+            inline batch<double, A> average(batch<double, A> const& x1, batch<double, A> const& x2) noexcept
+            {
+                return averagef(x1, x2); // double batches use the floating-point form
+            }
+        }
+        template <class A>
+        inline batch<float, A> asinh(batch<float, A> const& self, requires_arch<generic>) noexcept
+        { // asinh (float): polynomial for |self| < 0.5, log-based form otherwise; the sign is restored by xor with bts.
+            using batch_type = batch<float, A>;
+            batch_type x = abs(self);
+            auto lthalf = x < batch_type(0.5);
+            batch_type x2 = x * x;
+            batch_type bts = bitofsign(self);
+            batch_type z(0.);
+            if (any(lthalf)) // the polynomial is only evaluated when at least one lane needs it
+            {
+                z = detail::horner<batch_type,
+                                   0x3f800000,
+                                   0xbe2aa9ad,
+                                   0x3d9949b1,
+                                   0xbd2ee581,
+                                   0x3ca4d6e6>(x2)
+                    * x;
+                if (all(lthalf)) // every lane is small: the log-based form is skipped
+                    return z ^ bts;
+            }
+            batch_type tmp = select(x > constants::oneosqrteps<batch_type>(), x, detail::average(x, hypot(batch_type(1.), x))); // x for very large lanes, else (x + hypot(1, x)) / 2
+#ifndef XSIMD_NO_NANS
+            return select(isnan(self), constants::nan<batch_type>(), select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts);
+#else
+            return select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts;
+#endif
+        }
+        template <class A>
+        inline batch<double, A> asinh(batch<double, A> const& self, requires_arch<generic>) noexcept
+        { // asinh (double): evaluated through log1p on |self|; the sign is restored by the final xor.
+            using batch_type = batch<double, A>;
+            batch_type x = abs(self);
+            auto test = x > constants::oneosqrteps<batch_type>(); // lanes treated as very large
+            batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) + hypot(batch_type(1.), x))); // large lanes: x - 1; others: x + x^2 / (1 + hypot(1, x))
+#ifndef XSIMD_NO_INFINITIES
+            z = select(x == constants::infinity<batch_type>(), x, z); // infinite lanes pass infinity on to log1p
+#endif
+            batch_type l1pz = log1p(z);
+            z = select(test, l1pz + constants::log_2<batch_type>(), l1pz); // large lanes add log_2
+            return bitofsign(self) ^ z;
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> asinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            // asinh(z) = -i * asin(i * z): (-imag, real) is i * z and (imag, -real) is -i * w.
+            const batch_type asin_of_iz = asin(batch_type(-z.imag(), z.real()));
+            return batch_type(asin_of_iz.imag(), -asin_of_iz.real());
+        }
+
+        // atan
+        namespace detail
+        {
+            template <class A>
+            static inline batch<float, A> kernel_atan(const batch<float, A>& x, const batch<float, A>& recx) noexcept // callers in this file pass a non-negative x and recx = 1 / x
+            {
+                using batch_type = batch<float, A>;
+                const auto flag1 = x < constants::tan3pio8<batch_type>(); // false on the lanes that are evaluated through -recx
+                const auto flag2 = (x >= batch_type(bit_cast<float>((uint32_t)0x3ed413cd))) && flag1; // middle range, evaluated through (x - 1) / (x + 1)
+                batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>()); // per-range offset: 0, pio4 or pio2
+                yy = select(flag2, constants::pio4<batch_type>(), yy);
+                batch_type xx = select(flag1, x, -recx); // reduced argument fed to the polynomial
+                xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
+                const batch_type z = xx * xx;
+                batch_type z1 = detail::horner<batch_type, // hex arguments are presumably float bit patterns of the coefficients
+                                               0xbeaaaa2aul,
+                                               0x3e4c925ful,
+                                               0xbe0e1b85ul,
+                                               0x3da4f0d1ul>(z);
+                z1 = fma(xx, z1 * z, xx); // xx + xx * z * (horner result)
+                z1 = select(flag2, z1 + constants::pio_4lo<batch_type>(), z1); // pio_4lo is added only where the pio4 offset is used
+                z1 = select(!flag1, z1 + constants::pio_2lo<batch_type>(), z1); // pio_2lo is added only where the pio2 offset is used
+                return yy + z1;
+            }
+            template <class A>
+            static inline batch<double, A> kernel_atan(const batch<double, A>& x, const batch<double, A>& recx) noexcept // callers in this file pass a non-negative x and recx = 1 / x
+            {
+                using batch_type = batch<double, A>;
+                const auto flag1 = x < constants::tan3pio8<batch_type>(); // false on the lanes that are evaluated through -recx
+                const auto flag2 = (x >= constants::tanpio8<batch_type>()) && flag1; // middle range, evaluated through (x - 1) / (x + 1)
+                batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>()); // per-range offset: 0, pio4 or pio2
+                yy = select(flag2, constants::pio4<batch_type>(), yy);
+                batch_type xx = select(flag1, x, -recx); // reduced argument fed to the rational approximation
+                xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
+                batch_type z = xx * xx;
+                z *= detail::horner<batch_type, // numerator; hex arguments are presumably double bit patterns of the coefficients
+                                    0xc0503669fd28ec8eull,
+                                    0xc05eb8bf2d05ba25ull,
+                                    0xc052c08c36880273ull,
+                                    0xc03028545b6b807aull,
+                                    0xbfec007fa1f72594ull>(z)
+                    / detail::horner1<batch_type, // denominator; horner1 presumably assumes an implicit leading coefficient of 1 - TODO confirm
+                                      0x4068519efbbd62ecull,
+                                      0x407e563f13b049eaull,
+                                      0x407b0e18d2e2be3bull,
+                                      0x4064a0dd43b8fa25ull,
+                                      0x4038dbc45b14603cull>(z);
+                z = fma(xx, z, xx); // xx + xx * z
+                z = select(flag2, z + constants::pio_4lo<batch_type>(), z); // pio_4lo is added only where the pio4 offset is used
+                z = z + select(flag1, batch_type(0.), constants::pio_2lo<batch_type>()); // pio_2lo is added only where the pio2 offset is used
+                return yy + z;
+            }
+        }
+        template <class A, class T>
+        inline batch<T, A> atan(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>; // the kernel works on |self|; the sign of self is xor-ed back afterwards
+            const batch_type magnitude = abs(self);
+            const batch_type unsigned_angle = detail::kernel_atan(magnitude, batch_type(1.) / magnitude);
+            return unsigned_angle ^ bitofsign(self);
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> atan(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            using real_batch = typename batch_type::real_batch;
+            const real_batch re = z.real();
+            const real_batch im = z.imag();
+            const real_batch re_sq = re * re;
+            const real_batch one(1.);
+            // Real part of the result: 0.5 * atan2(2 * re, 1 - re^2 - im^2).
+            const real_batch real_part = 0.5 * atan2(2. * re, one - re_sq - (im * im));
+            const real_batch im_plus_one = im + one;
+            const real_batch im_minus_one = im - one;
+            const real_batch num = re_sq + im_plus_one * im_plus_one;
+            const real_batch den = re_sq + im_minus_one * im_minus_one;
+            // Lanes where re == 0 and im == 1 are mapped explicitly to (0, +infinity).
+            return select((re == real_batch(0.)) && (im == real_batch(1.)),
+                          batch_type(real_batch(0.), constants::infinity<real_batch>()),
+                          batch_type(real_part, 0.25 * log(num / den)));
+        }
+
+        // atanh
+        /* origin: boost/simd/arch/common/simd/function/acosh.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A, class T>
+        inline batch<T, A> atanh(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const batch_type magnitude = abs(self);
+            const batch_type twice = magnitude + magnitude;
+            const auto is_small = magnitude < batch_type(0.5);
+            const batch_type ratio = select(is_small, magnitude, twice) / (batch_type(1.) - magnitude);
+            const batch_type log1p_arg = select(is_small, fma(twice, ratio, twice), ratio);
+            return bitofsign(self) ^ (batch_type(0.5) * log1p(log1p_arg));
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> atanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            // atanh(z) = -i * atan(i * z): (-imag, real) is i * z and (imag, -real) is -i * w.
+            const batch_type atan_of_iz = atan(batch_type(-z.imag(), z.real()));
+            return batch_type(atan_of_iz.imag(), -atan_of_iz.real());
+        }
+
+        // atan2
+        template <class A, class T>
+        inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>; // angle of |self / other|, mirrored to pi - angle unless other > 0, then scaled by signnz(self)
+            const batch_type ratio = abs(self / other);
+            const batch_type acute = detail::kernel_atan(ratio, batch_type(1.) / ratio);
+            return select(other > batch_type(0.), acute, constants::pi<batch_type>() - acute) * signnz(self);
+        }
+
+        // cos
+        namespace detail
+        {
+            template <class T, class A>
+            inline batch<T, A> quadrant(const batch<T, A>& x) noexcept // generic overload: bitwise AND of every lane with 3
+            {
+                return x & batch<T, A>(3); // keeps the two low bits of each lane
+            }
+
+            template <class A>
+            inline batch<float, A> quadrant(const batch<float, A>& x) noexcept // float overload: round-trips through an integer batch
+            {
+                return to_float(quadrant(to_int(x))); // to_int, then the quadrant overload matching that integer batch, then back to float
+            }
+
+            template <class A>
+            inline batch<double, A> quadrant(const batch<double, A>& x) noexcept
+            {
+                using batch_type = batch<double, A>; // result is 4 * (x / 4 - floor(x / 4)), computed in floating point
+                const batch_type quarter = x * batch_type(0.25);
+                return (quarter - floor(quarter)) * batch_type(4.);
+            }
+            /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+
+            template <class A>
+            inline batch<float, A> cos_eval(const batch<float, A>& z) noexcept // callers in this file pass z = xr * xr (the squared reduced argument)
+            {
+                using batch_type = batch<float, A>;
+                batch_type y = detail::horner<batch_type, // hex arguments are presumably float bit patterns of the coefficients
+                                              0x3d2aaaa5,
+                                              0xbab60619,
+                                              0x37ccf5ce>(z);
+                return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z); // 1 - z / 2 + y * z * z
+            }
+
+            template <class A>
+            inline batch<float, A> sin_eval(const batch<float, A>& z, const batch<float, A>& x) noexcept // callers in this file pass z = x * x
+            {
+                using batch_type = batch<float, A>;
+                batch_type y = detail::horner<batch_type, // hex arguments are presumably float bit patterns of the coefficients
+                                              0xbe2aaaa2,
+                                              0x3c08839d,
+                                              0xb94ca1f9>(z);
+                return fma(y * z, x, x); // x + x * z * y
+            }
+
+            template <class A>
+            static inline batch<float, A> base_tancot_eval(const batch<float, A>& z) noexcept // shared kernel of tan_eval and cot_eval (float)
+            {
+                using batch_type = batch<float, A>;
+                batch_type zz = z * z;
+                batch_type y = detail::horner<batch_type, // hex arguments are presumably float bit patterns of the coefficients
+                                              0x3eaaaa6f,
+                                              0x3e0896dd,
+                                              0x3d5ac5c9,
+                                              0x3cc821b5,
+                                              0x3b4c779c,
+                                              0x3c19c53b>(zz);
+                return fma(y, zz * z, z); // z + y * z^3
+            }
+
+            template <class A, class BB>
+            static inline batch<float, A> tan_eval(const batch<float, A>& z, const BB& test) noexcept
+            {
+                using batch_type = batch<float, A>;
+                const batch_type base = base_tancot_eval(z); // value of the shared tan/cot kernel
+                return select(test, base, -batch_type(1.) / base); // lanes failing test receive -1 / base
+            }
+
+            template <class A, class BB>
+            static inline batch<float, A> cot_eval(const batch<float, A>& z, const BB& test) noexcept
+            {
+                using batch_type = batch<float, A>;
+                const batch_type base = base_tancot_eval(z); // value of the shared tan/cot kernel
+                return select(test, batch_type(1.) / base, -base); // lanes passing test receive 1 / base, the others -base
+            }
+
+            /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class A>
+            static inline batch<double, A> cos_eval(const batch<double, A>& z) noexcept // callers in this file pass z = xr * xr (the squared reduced argument)
+            {
+                using batch_type = batch<double, A>;
+                batch_type y = detail::horner<batch_type, // hex arguments are presumably double bit patterns of the coefficients
+                                              0x3fe0000000000000ull,
+                                              0xbfa5555555555551ull,
+                                              0x3f56c16c16c15d47ull,
+                                              0xbefa01a019ddbcd9ull,
+                                              0x3e927e4f8e06d9a5ull,
+                                              0xbe21eea7c1e514d4ull,
+                                              0x3da8ff831ad9b219ull>(z);
+                return batch_type(1.) - y * z; // 1 - z * (horner result)
+            }
+
+            template <class A>
+            static inline batch<double, A> sin_eval(const batch<double, A>& z, const batch<double, A>& x) noexcept // callers in this file pass z = x * x
+            {
+                using batch_type = batch<double, A>;
+                batch_type y = detail::horner<batch_type, // hex arguments are presumably double bit patterns of the coefficients
+                                              0xbfc5555555555548ull,
+                                              0x3f8111111110f7d0ull,
+                                              0xbf2a01a019bfdf03ull,
+                                              0x3ec71de3567d4896ull,
+                                              0xbe5ae5e5a9291691ull,
+                                              0x3de5d8fd1fcf0ec1ull>(z);
+                return fma(y * z, x, x); // x + x * z * y
+            }
+
+            template <class A>
+            static inline batch<double, A> base_tancot_eval(const batch<double, A>& z) noexcept // shared kernel of tan_eval and cot_eval (double)
+            {
+                using batch_type = batch<double, A>;
+                batch_type zz = z * z;
+                batch_type num = detail::horner<batch_type, // numerator, evaluated in zz
+                                                0xc1711fead3299176ull,
+                                                0x413199eca5fc9dddull,
+                                                0xc0c992d8d24f3f38ull>(zz);
+                batch_type den = detail::horner1<batch_type, // denominator; horner1 presumably assumes an implicit leading coefficient of 1 - TODO confirm
+                                                 0xc189afe03cbe5a31ull,
+                                                 0x4177d98fc2ead8efull,
+                                                 0xc13427bc582abc96ull,
+                                                 0x40cab8a5eeb36572ull>(zz);
+                return fma(z, (zz * (num / den)), z); // z + z * zz * num / den
+            }
+
+            template <class A, class BB>
+            static inline batch<double, A> tan_eval(const batch<double, A>& z, const BB& test) noexcept
+            {
+                using batch_type = batch<double, A>;
+                const batch_type base = base_tancot_eval(z); // value of the shared tan/cot kernel
+                return select(test, base, -batch_type(1.) / base); // lanes failing test receive -1 / base
+            }
+
+            template <class A, class BB>
+            static inline batch<double, A> cot_eval(const batch<double, A>& z, const BB& test) noexcept
+            {
+                using batch_type = batch<double, A>;
+                const batch_type base = base_tancot_eval(z); // value of the shared tan/cot kernel
+                return select(test, batch_type(1.) / base, -base); // lanes passing test receive 1 / base, the others -base
+            }
+            /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+
+            struct trigo_radian_tag // default Tag of trigo_reducer and of detail::sin
+            {
+            };
+            struct trigo_pi_tag // selects the trigo_reducer specialization that multiplies the reduced argument by pi
+            {
+            };
+
+            template <class B, class Tag = trigo_radian_tag>
+            struct trigo_reducer // argument reduction shared by cos / sin / sincos / tan; this primary template serves the default (radian) tag
+            {
+                static inline B reduce(const B& x, B& xr) noexcept // stores the reduced argument in xr and returns a per-lane quadrant value; callers in this file pass abs(self)
+                {
+                    if (all(x <= constants::pio4<B>())) // every lane is already within pio4: nothing to reduce
+                    {
+                        xr = x;
+                        return B(0.);
+                    }
+                    else if (all(x <= constants::pio2<B>())) // every lane within pio2: subtract pio2 (split in three parts) on the lanes above pio4
+                    {
+                        auto test = x > constants::pio4<B>();
+                        xr = x - constants::pio2_1<B>();
+                        xr -= constants::pio2_2<B>();
+                        xr -= constants::pio2_3<B>();
+                        xr = select(test, xr, x);
+                        return select(test, B(1.), B(0.));
+                    }
+                    else if (all(x <= constants::twentypi<B>())) // every lane within twentypi: three-part subtraction of xi * pio2
+                    {
+                        B xi = nearbyint(x * constants::twoopi<B>()); // integer multiple of pio2 to remove
+                        xr = fnma(xi, constants::pio2_1<B>(), x);
+                        xr -= xi * constants::pio2_2<B>();
+                        xr -= xi * constants::pio2_3<B>();
+                        return quadrant(xi);
+                    }
+                    else if (all(x <= constants::mediumpi<B>())) // every lane within mediumpi: subtraction with explicit rounding-error compensation
+                    {
+                        B fn = nearbyint(x * constants::twoopi<B>());
+                        B r = x - fn * constants::pio2_1<B>();
+                        B w = fn * constants::pio2_1t<B>(); // NOTE(review): this value is overwritten two lines below before being read
+                        B t = r;
+                        w = fn * constants::pio2_2<B>();
+                        r = t - w;
+                        w = fn * constants::pio2_2t<B>() - ((t - r) - w); // (t - r) - w is the rounding error of the subtraction just above
+                        t = r;
+                        w = fn * constants::pio2_3<B>();
+                        r = t - w;
+                        w = fn * constants::pio2_3t<B>() - ((t - r) - w);
+                        xr = r - w;
+                        return quadrant(fn);
+                    }
+                    else // at least one lane fails x <= mediumpi: every lane is reduced one by one in scalar code
+                    {
+                        static constexpr std::size_t size = B::size;
+                        using value_type = typename B::value_type;
+                        alignas(B) std::array<value_type, size> tmp; // per-lane quadrant values
+                        alignas(B) std::array<value_type, size> txr; // per-lane reduced arguments
+                        alignas(B) std::array<value_type, size> args; // lanes of x spilled to memory
+                        x.store_aligned(args.data());
+
+                        for (std::size_t i = 0; i < size; ++i)
+                        {
+                            double arg = args[i]; // the scalar reduction below works in double
+                            if (arg == std::numeric_limits<value_type>::infinity()) // +infinity: quadrant 0 and a NaN reduced argument
+                            {
+                                tmp[i] = 0.;
+                                txr[i] = std::numeric_limits<value_type>::quiet_NaN();
+                            }
+                            else
+                            {
+                                double y[2];
+                                std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y);
+                                tmp[i] = value_type(n & 3); // only the two low bits of n are kept
+                                txr[i] = value_type(y[0]); // only y[0] is used; y[1] is dropped
+                            }
+                        }
+                        xr = B::load_aligned(&txr[0]);
+                        B res = B::load_aligned(&tmp[0]);
+                        return res;
+                    }
+                }
+            };
+
+            template <class B>
+            struct trigo_reducer<B, trigo_pi_tag>
+            {
+                static inline B reduce(const B& x, B& xr) noexcept
+                {
+                    const B nearest = nearbyint(x * B(2.)); // nearest integer to 2 * x
+                    const B remainder = x - nearest * B(0.5);
+                    xr = remainder * constants::pi<B>(); // reduced argument, scaled by pi
+                    return quadrant(nearest);
+                }
+            };
+
+        }
+        template <class A, class T>
+        inline batch<T, A> cos(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const batch_type magnitude = abs(self); // only |self| is used by this function
+            batch_type reduced = constants::nan<batch_type>();
+            const batch_type quad = detail::trigo_reducer<batch_type>::reduce(magnitude, reduced);
+            const auto upper_half = select(quad >= batch_type(2.), batch_type(1.), batch_type(0.)); // 1 where quad >= 2, else 0
+            const auto low_part = fma(batch_type(-2.), upper_half, quad); // quad - 2 * upper_half
+            const batch_type reduced_sq = reduced * reduced;
+            const batch_type sin_part = detail::sin_eval(reduced_sq, reduced);
+            const batch_type cos_part = detail::cos_eval(reduced_sq);
+            const auto negate = select((low_part ^ upper_half) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
+            const batch_type unsigned_result = select(low_part != batch_type(0.), sin_part, cos_part);
+            return unsigned_result ^ negate;
+        }
+
+        template <class A, class T>
+        inline batch<std::complex<T>, A> cos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept // complex cosine assembled from the real cos / sin / cosh / sinh
+        {
+            return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) }; // cos(x + iy) = cos(x) cosh(y) - i sin(x) sinh(y)
+        }
+
+        // cosh
+
+        /* origin: boost/simd/arch/common/simd/function/cosh.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+
+        template <class A, class T>
+        inline batch<T, A> cosh(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const batch_type magnitude = abs(self); // only |self| is used by this function
+            const auto near_overflow = magnitude > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
+            const batch_type scale = select(near_overflow, batch_type(0.5), batch_type(1.));
+            const batch_type e = exp(magnitude * scale); // exp(|x| / 2) on the near-overflow lanes, exp(|x|) elsewhere
+            const batch_type half_e = batch_type(0.5) * e;
+            return select(near_overflow, half_e * e, detail::average(e, batch_type(1.) / e));
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> cosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            const auto re = z.real();
+            const auto im = z.imag();
+            return { cosh(re) * cos(im), sinh(re) * sin(im) }; // cosh(x + iy) = cosh(x) cos(y) + i sinh(x) sin(y)
+        }
+
+        // sin
+        namespace detail
+        {
+            template <class A, class T, class Tag = trigo_radian_tag>
+            inline batch<T, A> sin(batch<T, A> const& self, Tag = Tag()) noexcept
+            {
+                using batch_type = batch<T, A>;
+                const batch_type magnitude = abs(self); // reduction runs on |self|; the sign of self is folded into sign_flip
+                batch_type reduced = constants::nan<batch_type>();
+                const batch_type quad = detail::trigo_reducer<batch_type, Tag>::reduce(magnitude, reduced);
+                const auto upper_half = select(quad >= batch_type(2.), batch_type(1.), batch_type(0.)); // 1 where quad >= 2, else 0
+                const auto low_part = fma(batch_type(-2.), upper_half, quad); // quad - 2 * upper_half
+                const batch_type reduced_sq = reduced * reduced;
+                const batch_type sin_part = detail::sin_eval(reduced_sq, reduced);
+                const batch_type cos_part = detail::cos_eval(reduced_sq);
+                const auto sign_flip = bitofsign(self) ^ select(upper_half != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
+                const batch_type unsigned_result = select(low_part == batch_type(0.), sin_part, cos_part);
+                return unsigned_result ^ sign_flip;
+            }
+        }
+
+        template <class A, class T>
+        inline batch<T, A> sin(batch<T, A> const& self, requires_arch<generic>) noexcept // generic-arch entry point for real sine
+        {
+            return detail::sin(self); // Tag is left at its default, trigo_radian_tag
+        }
+
+        template <class A, class T>
+        inline batch<std::complex<T>, A> sin(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept // complex sine assembled from the real sin / cos / cosh / sinh
+        {
+            return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) }; // sin(x + iy) = sin(x) cosh(y) + i cos(x) sinh(y)
+        }
+
+        // sincos
+        template <class A, class T>
+        inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const batch_type magnitude = abs(self); // one reduction of |self| feeds both results
+            batch_type reduced = constants::nan<batch_type>();
+            const batch_type quad = detail::trigo_reducer<batch_type>::reduce(magnitude, reduced);
+            const auto upper_half = select(quad >= batch_type(2.), batch_type(1.), batch_type(0.)); // 1 where quad >= 2, else 0
+            const auto low_part = fma(batch_type(-2.), upper_half, quad); // quad - 2 * upper_half
+            const batch_type reduced_sq = reduced * reduced;
+            const batch_type sin_part = detail::sin_eval(reduced_sq, reduced);
+            const batch_type cos_part = detail::cos_eval(reduced_sq);
+            const batch_type sin_unsigned = select(low_part == batch_type(0.), sin_part, cos_part);
+            const batch_type cos_unsigned = select(low_part != batch_type(0.), sin_part, cos_part);
+            const auto sin_flip = bitofsign(self) ^ select(upper_half != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
+            const auto cos_flip = select((low_part ^ upper_half) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
+            return std::make_pair(sin_unsigned ^ sin_flip, cos_unsigned ^ cos_flip); // first is the sine, second the cosine
+        }
+
+        template <class A, class T>
+        inline std::pair<batch<std::complex<T>, A>, batch<std::complex<T>, A>>
+        sincos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            using real_batch = typename batch_type::real_batch;
+            const real_batch cos_re = cos(z.real());
+            const real_batch sin_re = sin(z.real());
+            const real_batch cosh_im = cosh(z.imag());
+            const real_batch sinh_im = sinh(z.imag());
+            return std::make_pair(batch_type(sin_re * cosh_im, cos_re * sinh_im), batch_type(cos_re * cosh_im, -sin_re * sinh_im)); // first is the sine, second the cosine
+        }
+
+        // sinh
+        namespace detail
+        {
+            /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class A>
+            inline batch<float, A> sinh_kernel(batch<float, A> const& self) noexcept // sinh() in this file uses this result only on lanes where |x| < 1
+            {
+                using batch_type = batch<float, A>;
+                batch_type sqr_self = self * self;
+                return detail::horner<batch_type, // evaluated in self * self, then multiplied by self
+                                      0x3f800000, // 1.0f
+                                      0x3e2aaacc, // 1.66667160211E-1f
+                                      0x3c087bbe, // 8.33028376239E-3f
+                                      0x39559e2f // 2.03721912945E-4f
+                                      >(sqr_self)
+                    * self;
+            }
+
+            template <class A>
+            inline batch<double, A> sinh_kernel(batch<double, A> const& self) noexcept // sinh() in this file uses this result only on lanes where |x| < 1
+            {
+                using batch_type = batch<double, A>;
+                batch_type sqrself = self * self;
+                return fma(self, (detail::horner<batch_type, // self + self * sqrself * (numerator / denominator)
+                                                 0xc115782bdbf6ab05ull, //  -3.51754964808151394800E5
+                                                 0xc0c694b8c71d6182ull, //  -1.15614435765005216044E4,
+                                                 0xc064773a398ff4feull, //  -1.63725857525983828727E2,
+                                                 0xbfe9435fe8bb3cd6ull //  -7.89474443963537015605E-1
+                                                 >(sqrself)
+                                  / detail::horner1<batch_type, // horner1 presumably assumes an implicit leading coefficient of 1 - TODO confirm
+                                                    0xc1401a20e4f90044ull, //  -2.11052978884890840399E6
+                                                    0x40e1a7ba7ed72245ull, //   3.61578279834431989373E4,
+                                                    0xc0715b6096e96484ull //  -2.77711081420602794433E2,
+                                                    >(sqrself))
+                               * sqrself,
+                           self);
+            }
+        }
+        /* origin: boost/simd/arch/common/simd/function/sinh.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A, class T>
+        inline batch<T, A> sinh(batch<T, A> const& a, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const batch_type half(0.5);
+            const batch_type magnitude = abs(a);
+            const auto is_small = magnitude < batch_type(1.); // lanes served by sinh_kernel
+            const batch_type sign = bitofsign(a); // xor-ed into every returned value
+            batch_type small_result(0.);
+            if (any(is_small))
+            {
+                small_result = detail::sinh_kernel(magnitude);
+                if (all(is_small)) // no lane needs the exponential path
+                    return small_result ^ sign;
+            }
+            const auto near_overflow = magnitude > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
+            const batch_type scale = select(near_overflow, half, batch_type(1.));
+            const batch_type e = exp(magnitude * scale); // exp(|x| / 2) on the near-overflow lanes, exp(|x|) elsewhere
+            const batch_type half_e = half * e;
+            const batch_type large_result = select(near_overflow, half_e * e, half_e - half / e);
+            return select(is_small, small_result, large_result) ^ sign;
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> sinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            const auto re = z.real();
+            const auto im = z.imag();
+            return { sinh(re) * cos(im), cosh(re) * sin(im) }; // sinh(x + iy) = sinh(x) cos(y) + i cosh(x) sin(y)
+        }
+
+        // tan
+        template <class A, class T>
+        inline batch<T, A> tan(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const batch_type magnitude = abs(self); // reduction runs on |self|; the sign of self is xor-ed back on return
+            batch_type reduced = constants::nan<batch_type>();
+            const batch_type quad = detail::trigo_reducer<batch_type>::reduce(magnitude, reduced);
+            const auto upper_half = select(quad >= batch_type(2.), batch_type(1.), batch_type(0.)); // 1 where quad >= 2, else 0
+            const auto low_part = fma(batch_type(-2.), upper_half, quad); // quad - 2 * upper_half
+            const auto use_direct = (low_part == batch_type(0.));
+            const batch_type unsigned_result = detail::tan_eval(reduced, use_direct);
+            return unsigned_result ^ bitofsign(self);
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> tan(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            using real_batch = typename batch_type::real_batch;
+            const real_batch denom = cos(2 * z.real()) + cosh(2 * z.imag());
+            const batch_type all_inf(constants::infinity<real_batch>(), constants::infinity<real_batch>()); // returned on lanes where denom == 0
+            const real_batch re_part = sin(2 * z.real()) / denom;
+            const real_batch sinh_2im = sinh(2 * z.imag());
+            const batch_type regular = select(isinf(sinh_2im), batch_type(re_part, real_batch(1.)), batch_type(re_part, sinh_2im / denom)); // imaginary part is set to 1 where sinh_2im is infinite
+            return select(denom == real_batch(0.), all_inf, regular);
+        }
+
+        // tanh
+        namespace detail
+        {
+            /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class B>
+            struct tanh_kernel; // primary template is only declared; float and double batch specializations follow
+
+            template <class A>
+            struct tanh_kernel<batch<float, A>> // polynomial kernel; tanh() in this file uses it only on lanes where |x| < 5/8
+            {
+                using batch_type = batch<float, A>;
+                static inline batch_type tanh(const batch_type& x) noexcept // x + x * sqrx * (horner result in sqrx)
+                {
+                    batch_type sqrx = x * x;
+                    return fma(detail::horner<batch_type,
+                                              0xbeaaaa99, //    -3.33332819422E-1F
+                                              0x3e088393, //    +1.33314422036E-1F
+                                              0xbd5c1e2d, //    -5.37397155531E-2F
+                                              0x3ca9134e, //    +2.06390887954E-2F
+                                              0xbbbaf0ea //    -5.70498872745E-3F
+                                              >(sqrx)
+                                   * sqrx,
+                               x, x);
+                }
+
+                static inline batch_type cotanh(const batch_type& x) noexcept // reciprocal of the tanh kernel above
+                {
+                    return batch_type(1.) / tanh(x);
+                }
+            };
+
+            template <class A>
+            struct tanh_kernel<batch<double, A>> // rational kernel p / q; tanh() in this file uses it only on lanes where |x| < 5/8
+            {
+                using batch_type = batch<double, A>;
+                static inline batch_type tanh(const batch_type& x) noexcept // x + x * sqrx * p(sqrx) / q(sqrx)
+                {
+                    batch_type sqrx = x * x;
+                    return fma(sqrx * p(sqrx) / q(sqrx), x, x);
+                }
+
+                static inline batch_type cotanh(const batch_type& x) noexcept // algebraically 1 / tanh(x), written with a single division
+                {
+                    batch_type sqrx = x * x;
+                    batch_type qval = q(sqrx);
+                    return qval / (x * fma(p(sqrx), sqrx, qval)); // q / (x * (p * sqrx + q))
+                }
+
+                static inline batch_type p(const batch_type& x) noexcept // numerator; callers in this struct pass the squared argument
+                {
+                    return detail::horner<batch_type,
+                                          0xc0993ac030580563, // -1.61468768441708447952E3
+                                          0xc058d26a0e26682d, // -9.92877231001918586564E1,
+                                          0xbfeedc5baafd6f4b // -9.64399179425052238628E-1
+                                          >(x);
+                }
+
+                static inline batch_type q(const batch_type& x) noexcept // denominator; horner1 presumably assumes an implicit leading coefficient of 1 - TODO confirm
+                {
+                    return detail::horner1<batch_type,
+                                           0x40b2ec102442040c, //  4.84406305325125486048E3
+                                           0x40a176fa0e5535fa, //  2.23548839060100448583E3,
+                                           0x405c33f28a581B86 //  1.12811678491632931402E2,
+                                           >(x);
+                }
+            };
+
+        }
+        /* origin: boost/simd/arch/common/simd/function/tanh.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A, class T>
+        inline batch<T, A> tanh(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const batch_type unit(1.);
+            const batch_type magnitude = abs(self);
+            const auto is_small = magnitude < (batch_type(5.) / batch_type(8.)); // lanes served by tanh_kernel
+            const batch_type sign = bitofsign(self); // xor-ed into every returned value
+            batch_type small_result = unit;
+            if (any(is_small))
+            {
+                small_result = detail::tanh_kernel<batch_type>::tanh(magnitude);
+                if (all(is_small)) // no lane needs the exponential path
+                    return small_result ^ sign;
+            }
+            const batch_type large_result = fma(batch_type(-2.), unit / (unit + exp(magnitude + magnitude)), unit); // 1 - 2 / (1 + exp(2|x|))
+            return select(is_small, small_result, large_result) ^ sign;
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> tanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            using real_batch = typename batch<std::complex<T>, A>::real_batch;
+            const real_batch two(2);
+            const auto twice_re = two * z.real();
+            const auto twice_im = two * z.imag();
+            const auto denom = cosh(twice_re) + cos(twice_im); // shared denominator of both components
+            return { sinh(twice_re) / denom, sin(twice_im) / denom };
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp
new file mode 100644
index 0000000000..be1da61358
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp
@@ -0,0 +1,1657 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX_HPP
+#define XSIMD_AVX_HPP
+
+#include <complex>
+#include <limits>
+#include <type_traits>
+
+#include "../types/xsimd_avx_register.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        // fwd: declaration only of the requires_arch<generic> insert overload (presumably defined with the generic kernels)
+        template <class A, class T, size_t I>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
+
+        namespace detail
+        {
+            inline void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept // integer register
+            {
+                low = _mm256_castsi256_si128(val); // lower 128 bits of val
+                high = _mm256_extractf128_si256(val, 1); // upper 128 bits of val
+            }
+            inline void split_avx(__m256 val, __m128& low, __m128& high) noexcept // single-precision register
+            {
+                low = _mm256_castps256_ps128(val); // lower 128 bits of val
+                high = _mm256_extractf128_ps(val, 1); // upper 128 bits of val
+            }
+            inline void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept // double-precision register
+            {
+                low = _mm256_castpd256_pd128(val); // lower 128 bits of val
+                high = _mm256_extractf128_pd(val, 1); // upper 128 bits of val
+            }
+            inline __m256i merge_sse(__m128i low, __m128i high) noexcept // inverse of split_avx for integer registers
+            {
+                return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1); // low -> bits 0..127, high -> bits 128..255
+            }
+            inline __m256 merge_sse(__m128 low, __m128 high) noexcept // single-precision variant
+            {
+                return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1); // low -> bits 0..127, high -> bits 128..255
+            }
+            inline __m256d merge_sse(__m128d low, __m128d high) noexcept // double-precision variant
+            {
+                return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1); // low -> bits 0..127, high -> bits 128..255
+            }
+            template <class F>
+            inline __m256i fwd_to_sse(F f, __m256i self) noexcept
+            {
+                __m128i lo, hi; // unary form: f is applied to each 128-bit half, low half first
+                split_avx(self, lo, hi);
+                const __m128i out_lo = f(lo);
+                const __m128i out_hi = f(hi);
+                return merge_sse(out_lo, out_hi);
+            }
+            template <class F>
+            inline __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept
+            {
+                __m128i lhs_lo, lhs_hi, rhs_lo, rhs_hi; // binary form: matching halves of self and other are paired
+                split_avx(self, lhs_lo, lhs_hi);
+                split_avx(other, rhs_lo, rhs_hi);
+                const __m128i out_lo = f(lhs_lo, rhs_lo);
+                const __m128i out_hi = f(lhs_hi, rhs_hi);
+                return merge_sse(out_lo, out_hi);
+            }
+            template <class F>
+            inline __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept
+            {
+                __m128i lo, hi; // scalar form: the same int32_t is handed to f for both halves
+                split_avx(self, lo, hi);
+                const __m128i out_lo = f(lo, other);
+                const __m128i out_hi = f(hi, other);
+                return merge_sse(out_lo, out_hi);
+            }
+        }
+
+        // abs: clears the sign bit of every lane by and-not with a mask that holds only the sign bit
+        template <class A>
+        inline batch<float, A> abs(batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31
+            return _mm256_andnot_ps(sign_mask, self); // (~sign_mask) & self
+        }
+        template <class A>
+        inline batch<double, A> abs(batch<double, A> const& self, requires_arch<avx>) noexcept
+        { // absolute value of every double lane: the sign bit is masked off, all other bits are kept
+            __m256d sign_mask = _mm256_set1_pd(-0.); // -0. as a double = 1 << 63 (sign bit only)
+            return _mm256_andnot_pd(sign_mask, self); // (~sign_mask) & self
+        }
+
+        // add: lane-wise sum; integer batches are handled as two 128-bit halves through batch<T, sse4_2>
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                      { return add(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+                                      self, other);
+        }
+        template <class A>
+        inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_add_ps(self, other);
+        }
+        template <class A>
+        inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_add_pd(self, other);
+        }
+
+        // all: testc(self, all-true mask) is nonzero only when no bit tested in the all-true mask is missing from self
+        template <class A>
+        inline bool all(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_testc_ps(self, batch_bool<float, A>(true)) != 0; // the ps form tests the sign bit of each 32-bit lane
+        }
+        template <class A>
+        inline bool all(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_testc_pd(self, batch_bool<double, A>(true)) != 0; // the pd form tests the sign bit of each 64-bit lane
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline bool all(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_testc_si256(self, batch_bool<T, A>(true)) != 0; // the si256 form tests all 256 bits
+        }
+
+        // any: testz(self, self) is 1 only when no tested bit of self is set, so its negation means "some lane is set"
+        template <class A>
+        inline bool any(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return !_mm256_testz_ps(self, self); // the ps form tests the sign bit of each 32-bit lane
+        }
+        template <class A>
+        inline bool any(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return !_mm256_testz_pd(self, self); // the pd form tests the sign bit of each 64-bit lane
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline bool any(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            return !_mm256_testz_si256(self, self); // the si256 form tests all 256 bits
+        }
+
+        // batch_bool_cast: builds the T_out mask from the same register bits, going through bitwise_cast on a batch<T_in, A>
+        template <class A, class T_out, class T_in>
+        inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx>) noexcept
+        {
+            return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data }; // the second parameter only selects the overload
+        }
+
+        // bitwise_and: bit-wise AND of two batches or two masks; integer variants go through the sse4_2 halves
+        template <class A>
+        inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_and_ps(self, other);
+        }
+        template <class A>
+        inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_and_pd(self, other);
+        }
+
+        template <class A>
+        inline batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_and_ps(self, other);
+        }
+        template <class A>
+        inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_and_pd(self, other);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                      { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+                                      self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                      { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, // halves are wrapped as batch, not batch_bool
+                                      self, other);
+        }
+
+        // bitwise_andnot: self & ~other (the andnot intrinsics negate their first operand, hence the swapped arguments)
+        template <class A>
+        inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_andnot_ps(other, self); // (~other) & self
+        }
+        template <class A>
+        inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_andnot_pd(other, self); // (~other) & self
+        }
+
+        template <class A>
+        inline batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_andnot_ps(other, self); // (~other) & self
+        }
+        template <class A>
+        inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_andnot_pd(other, self); // (~other) & self
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                      { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+                                      self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                      { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, // halves are wrapped as batch, not batch_bool
+                                      self, other);
+        }
+
+        // bitwise_lshift: forwards each 128-bit half and the shift count to the sse4_2 bitwise_lshift kernel
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
+                                      { return bitwise_lshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
+                                      self, other);
+        }
+
+        // bitwise_not (integer batches and masks): each 128-bit half is complemented by the sse4_2 kernel
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s) noexcept
+                                      { return bitwise_not(batch<T, sse4_2>(s), sse4_2 {}); },
+                                      self);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s) noexcept
+                                      { return bitwise_not(batch_bool<T, sse4_2>(s), sse4_2 {}); },
+                                      self);
+        }
+
+        // bitwise_or: bit-wise OR of two batches or two masks; integer variants go through the sse4_2 halves
+        template <class A>
+        inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_or_ps(self, other);
+        }
+        template <class A>
+        inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_or_pd(self, other);
+        }
+        template <class A>
+        inline batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_or_ps(self, other);
+        }
+        template <class A>
+        inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_or_pd(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                      { return bitwise_or(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+                                      self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                      { return bitwise_or(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o)); }, // halves are wrapped as batch_bool here
+                                      self, other);
+        }
+
+        // bitwise_rshift: forwards each 128-bit half and the shift count to the sse4_2 bitwise_rshift kernel
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
+                                      { return bitwise_rshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
+                                      self, other);
+        }
+
+        // bitwise_xor: bit-wise XOR of two batches or two masks; the integer batch variant goes through the sse4_2 halves
+        template <class A>
+        inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_xor_ps(self, other);
+        }
+        template <class A>
+        inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_xor_pd(self, other);
+        }
+        template <class A>
+        inline batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_xor_ps(self, other);
+        }
+        template <class A>
+        inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_xor_pd(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                      { return bitwise_xor(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); }, // explicit sse4_2 kernel call
+                                      self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+        { // integer mask overload: takes masks and returns a mask; each 128-bit half is xored as an sse4_2 batch_bool
+            return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                      { return bitwise_xor(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o), sse4_2 {}); },
+                                      self, other);
+        }
+
+        // bitwise_cast: reinterprets the 256-bit register as another element type; the unnamed second parameter only selects the overload
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+        {
+            return _mm256_castsi256_ps(self); // integer -> float view
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
+        {
+            return _mm256_castsi256_pd(self); // integer -> double view
+        }
+        template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
+        inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx>) noexcept
+        {
+            return batch<Tp, A>(self.data); // integer -> integer: the same register is wrapped in the target batch type
+        }
+        template <class A>
+        inline batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
+        {
+            return _mm256_castps_pd(self); // float -> double view
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
+        {
+            return _mm256_castps_si256(self); // float -> integer view
+        }
+        template <class A>
+        inline batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+        {
+            return _mm256_castpd_ps(self); // double -> float view
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
+        {
+            return _mm256_castpd_si256(self); // double -> integer view
+        }
+
+        // bitwise_not (float/double batches and masks): xor with an all-ones register flips every bit
+        template <class A>
+        inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); // set1_epi32(-1) = all bits set
+        }
+        template <class A>
+        inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); // set1_epi32(-1) = all bits set
+        }
+        template <class A>
+        inline batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); // set1_epi32(-1) = all bits set
+        }
+        template <class A>
+        inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); // set1_epi32(-1) = all bits set
+        }
+
+        // broadcast: replicates val into every lane; the integer overload picks the set1 intrinsic from sizeof(T)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> broadcast(T val, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm256_set1_epi8(val);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_set1_epi16(val);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_set1_epi32(val);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_set1_epi64x(val);
+            }
+            else
+            {
+                assert(false && "unsupported"); // integral T whose size is not 1, 2, 4 or 8 bytes
+                return {};
+            }
+        }
+        template <class A>
+        inline batch<float, A> broadcast(float val, requires_arch<avx>) noexcept
+        {
+            return _mm256_set1_ps(val);
+        }
+        template <class A>
+        inline batch<double, A> broadcast(double val, requires_arch<avx>) noexcept
+        {
+            return _mm256_set1_pd(val);
+        }
+
+        // ceil: maps directly onto the AVX ceil intrinsics for float and double lanes
+        template <class A>
+        inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_ceil_ps(self);
+        }
+        template <class A>
+        inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_ceil_pd(self);
+        }
+
+        namespace detail
+        {
+            // On clang, _mm256_extractf128_ps is built upon build_shufflevector,
+            // which requires the index parameter to be a constant, hence the template argument
+            template <int index, class B>
+            inline B get_half_complex_f(const B& real, const B& imag) noexcept
+            {
+                const __m128 re_half = _mm256_extractf128_ps(real, index);
+                const __m128 im_half = _mm256_extractf128_ps(imag, index);
+                const __m128 upper_pairs = _mm_unpackhi_ps(re_half, im_half);
+                const __m128 lower_pairs = _mm_unpacklo_ps(re_half, im_half);
+                __m256 interleaved = real; // both halves are overwritten below
+                interleaved = _mm256_insertf128_ps(interleaved, lower_pairs, 0);
+                interleaved = _mm256_insertf128_ps(interleaved, upper_pairs, 1);
+                return interleaved;
+            }
+            template <int index, class B>
+            inline B get_half_complex_d(const B& real, const B& imag) noexcept
+            {
+                const __m128d re_half = _mm256_extractf128_pd(real, index);
+                const __m128d im_half = _mm256_extractf128_pd(imag, index);
+                const __m128d upper_pair = _mm_unpackhi_pd(re_half, im_half);
+                const __m128d lower_pair = _mm_unpacklo_pd(re_half, im_half);
+                __m256d interleaved = real; // both halves are overwritten below
+                interleaved = _mm256_insertf128_pd(interleaved, lower_pair, 0);
+                interleaved = _mm256_insertf128_pd(interleaved, upper_pair, 1);
+                return interleaved;
+            }
+
+            // complex_low: get_half_complex with index 0, i.e. built from the lower 128 bits of real() and imag()
+            template <class A>
+            inline batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
+            {
+                return get_half_complex_f<0>(self.real(), self.imag());
+            }
+            template <class A>
+            inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
+            {
+                return get_half_complex_d<0>(self.real(), self.imag());
+            }
+
+            // complex_high: get_half_complex with index 1, i.e. built from the upper 128 bits of real() and imag()
+            template <class A>
+            inline batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
+            {
+                return get_half_complex_f<1>(self.real(), self.imag());
+            }
+            template <class A>
+            inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
+            {
+                return get_half_complex_d<1>(self.real(), self.imag());
+            }
+        }
+
+        // fast_cast
+        namespace detail
+        {
+            template <class A>
+            inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+            {
+                return _mm256_cvtepi32_ps(self); // signed 32-bit lanes -> float lanes, single intrinsic
+            }
+
+            template <class A>
+            inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx>) noexcept
+            {
+                // Split each lane of v into 16-bit halves, convert both through the signed path, then recombine.
+                // Technique: https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
+                const __m256i low16_mask = _mm256_set1_epi32(0xFFFF);
+                const __m256 scale_2_16 = _mm256_set1_ps(65536.0f);
+
+                const __m256i low_bits = bitwise_and(batch<uint32_t, A>(v), batch<uint32_t, A>(low16_mask)); // 16 least significant bits of v
+                const __m256i high_bits = bitwise_rshift(batch<uint32_t, A>(v), 16, avx {}); // 16 most significant bits of v
+                const __m256 low_as_float = _mm256_cvtepi32_ps(low_bits); // no rounding
+                const __m256 high_as_float = _mm256_cvtepi32_ps(high_bits); // no rounding
+                const __m256 high_scaled = _mm256_mul_ps(scale_2_16, high_as_float); // no rounding
+                return _mm256_add_ps(high_scaled, low_as_float); // rounding may occur here; mul and add may fuse to fma on haswell and newer
+            }
+
+            template <class A>
+            inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx>) noexcept
+            {
+                return _mm256_cvttps_epi32(self); // cvtt = conversion with truncation
+            }
+
+            template <class A>
+            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<avx>) noexcept
+            {
+                const __m256 two_pow_31 = _mm256_set1_ps(1u << 31); // 2^31 as a float
+                const __m256 top_bit = _mm256_castsi256_ps(_mm256_set1_epi32(1u << 31)); // bit 31 of every lane
+                const __m256 direct = _mm256_castsi256_ps(_mm256_cvttps_epi32(self)); // lanes below 2^31: signed conversion as is
+                const __m256 rebased = _mm256_castsi256_ps(_mm256_cvttps_epi32(_mm256_sub_ps(self, two_pow_31))); // convert self - 2^31
+                const __m256 is_large = _mm256_cmp_ps(self, two_pow_31, _CMP_GE_OQ); // lanes with self >= 2^31
+                return _mm256_castps_si256(_mm256_blendv_ps(direct, _mm256_xor_ps(rebased, top_bit), is_large)); // large lanes get bit 31 put back
+            }
+        }
+
+        // div: lane-wise division, float and double only in this part of the file
+        template <class A>
+        inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_div_ps(self, other);
+        }
+        template <class A>
+        inline batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_div_pd(self, other);
+        }
+
+        // eq: lane-wise equality producing a mask; mask-vs-mask equality is written as the complement of !=
+        template <class A>
+        inline batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_cmp_ps(self, other, _CMP_EQ_OQ); // EQ_OQ: ordered, non-signalling equality
+        }
+        template <class A>
+        inline batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_cmp_pd(self, other, _CMP_EQ_OQ); // EQ_OQ: ordered, non-signalling equality
+        }
+        template <class A>
+        inline batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return ~(self != other);
+        }
+        template <class A>
+        inline batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return ~(self != other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                      { return eq(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); }, // each 128-bit half compared by the sse4_2 kernel
+                                      self, other);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return ~(self != other);
+        }
+
+        // floor: maps directly onto the AVX floor intrinsics for float and double lanes
+        template <class A>
+        inline batch<float, A> floor(batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_floor_ps(self);
+        }
+        template <class A>
+        inline batch<double, A> floor(batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_floor_pd(self);
+        }
+
+        // from_mask: bit i of mask decides whether lane i of the resulting batch_bool is all ones or all zeros
+        template <class A>
+        inline batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+        {
+            alignas(A::alignment()) static const uint64_t lut32[] = { // entry k: two 32-bit lanes for the two mask bits of k
+                0x0000000000000000ul,
+                0x00000000FFFFFFFFul,
+                0xFFFFFFFF00000000ul,
+                0xFFFFFFFFFFFFFFFFul,
+            };
+            assert(!(mask & ~0xFFull) && "inbound mask"); // ull: the check stays 64 bits wide where unsigned long is 32-bit
+            return _mm256_castsi256_ps(_mm256_setr_epi64x(lut32[mask & 0x3], lut32[(mask >> 2) & 0x3], lut32[(mask >> 4) & 0x3], lut32[(mask >> 6) & 0x3])); // every index bounded, also with asserts off
+        }
+        template <class A>
+        inline batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+        { // bit i of mask decides whether 64-bit lane i is all ones; row k of lut64 is the expansion of mask == k
+            alignas(A::alignment()) static const uint64_t lut64[][4] = {
+                { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
+                { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
+                { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
+                { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
+                { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+                { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+                { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+                { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+                { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+                { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+                { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+                { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+                { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+                { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+                { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+                { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+            };
+            assert(!(mask & ~0xFull) && "inbound mask"); // ull: the check stays 64 bits wide where unsigned long is 32-bit
+            return _mm256_castsi256_pd(_mm256_load_si256(reinterpret_cast<const __m256i*>(lut64[mask & 0xF]))); // row index bounded, also with asserts off
+        }
+        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+        { // Expands a bitmask into a lane mask: bit i set -> lane i all ones, otherwise all zeros.
+            alignas(A::alignment()) static const uint32_t lut32[] = { // 4 mask bits -> four 8-bit lanes
+                0x00000000,
+                0x000000FF,
+                0x0000FF00,
+                0x0000FFFF,
+                0x00FF0000,
+                0x00FF00FF,
+                0x00FFFF00,
+                0x00FFFFFF,
+                0xFF000000,
+                0xFF0000FF,
+                0xFF00FF00,
+                0xFF00FFFF,
+                0xFFFF0000,
+                0xFFFF00FF,
+                0xFFFFFF00,
+                0xFFFFFFFF,
+            };
+            alignas(A::alignment()) static const uint64_t lut64[] = { // 4 mask bits -> four 16-bit lanes
+                0x0000000000000000ul,
+                0x000000000000FFFFul,
+                0x00000000FFFF0000ul,
+                0x00000000FFFFFFFFul,
+                0x0000FFFF00000000ul,
+                0x0000FFFF0000FFFFul,
+                0x0000FFFFFFFF0000ul,
+                0x0000FFFFFFFFFFFFul,
+                0xFFFF000000000000ul,
+                0xFFFF00000000FFFFul,
+                0xFFFF0000FFFF0000ul,
+                0xFFFF0000FFFFFFFFul,
+                0xFFFFFFFF00000000ul,
+                0xFFFFFFFF0000FFFFul,
+                0xFFFFFFFFFFFF0000ul,
+                0xFFFFFFFFFFFFFFFFul,
+            };
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                assert(!(mask & ~0xFFFFFFFFull) && "inbound mask"); // ull: stays 64-bit where unsigned long is 32-bit
+                return _mm256_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF],
+                                         lut32[(mask >> 8) & 0xF], lut32[(mask >> 12) & 0xF],
+                                         lut32[(mask >> 16) & 0xF], lut32[(mask >> 20) & 0xF],
+                                         lut32[(mask >> 24) & 0xF], lut32[mask >> 28]);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                assert(!(mask & ~0xFFFFull) && "inbound mask"); // ull so that bits 32..63 are checked as well
+                return _mm256_setr_epi64x(lut64[mask & 0xF], lut64[(mask >> 4) & 0xF], lut64[(mask >> 8) & 0xF], lut64[(mask >> 12) & 0xF]);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_castps_si256(from_mask(batch_bool<float, A> {}, mask, avx {})); // 32-bit lanes: reuse the float overload
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_castpd_si256(from_mask(batch_bool<double, A> {}, mask, avx {})); // 64-bit lanes: reuse the double overload
+            }
+        }
+
+        // haddp
+        template <class A>
+        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx>) noexcept
+        {
+            // row = (a,b,c,d,e,f,g,h): eight batches; lane i of the result is the sum of the eight lanes of row[i]
+            // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7)
+            __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]);
+            // tmp1 = (c0+c1, c2+c3, d0+d1, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7)
+            __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]);
+            // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3,
+            // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7)
+            tmp1 = _mm256_hadd_ps(tmp0, tmp1);
+            // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7)
+            tmp0 = _mm256_hadd_ps(row[4], row[5]);
+            // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7)
+            __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]);
+            // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3,
+            // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7)
+            tmp2 = _mm256_hadd_ps(tmp0, tmp2);
+            // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3,
+            // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7)
+            tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000);
+            // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7,
+            // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3)
+            tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21);
+            return _mm256_add_ps(tmp0, tmp1); // each lane now adds the two half-sums of one row
+        }
+        template <class A>
+        inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx>) noexcept
+        {
+            // row = (a,b,c,d): four batches; lane i of the result is the sum of the four lanes of row[i]
+            // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3)
+            __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]);
+            // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3)
+            __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]);
+            // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3)
+            __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100);
+            // tmp1 = (a2+a3, b2+b3, c0+c1, d0+d1)
+            tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21);
+            return _mm256_add_pd(tmp1, tmp2); // each lane now adds the two half-sums of one row
+        }
+
+        // insert
+        template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
+        {
+            // Returns `self` with lane `I` replaced by `val`. Lanes of 8, 16 and 32 bits use
+            // the dedicated insert intrinsics; any other width, and every width when the
+            // intrinsics are compiled out below, goes through the generic kernel.
+#if !defined(_MSC_VER) || _MSC_VER > 1900
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm256_insert_epi8(self, val, I);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_insert_epi16(self, val, I);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_insert_epi32(self, val, I);
+            }
+#endif
+            // Single shared fallback: reached whenever no branch above returned.
+            return insert(self, val, pos, generic {});
+        }
+
+        // isnan
+        template <class A> // isnan (float): flags the lanes of `self` that hold a NaN
+        inline batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_cmp_ps(self, self, _CMP_UNORD_Q); // unordered self-compare: true only for NaN lanes
+        }
+        template <class A> // isnan (double): flags the lanes of `self` that hold a NaN
+        inline batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_cmp_pd(self, self, _CMP_UNORD_Q); // unordered self-compare: true only for NaN lanes
+        }
+
+        // le
+        template <class A> // le (float): lane-wise self <= other
+        inline batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_cmp_ps(self, other, _CMP_LE_OQ); // ordered, quiet predicate: a NaN operand yields false
+        }
+        template <class A> // le (double): lane-wise self <= other
+        inline batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_cmp_pd(self, other, _CMP_LE_OQ); // ordered, quiet predicate: a NaN operand yields false
+        }
+
+        // load_aligned
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
+        {
+            return _mm256_load_si256((__m256i const*)mem); // aligned 256-bit load: mem must sit on a 32-byte boundary
+        }
+        template <class A> // load_aligned (float)
+        inline batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
+        {
+            return _mm256_load_ps(mem); // aligned 256-bit load: mem must sit on a 32-byte boundary
+        }
+        template <class A> // load_aligned (double)
+        inline batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
+        {
+            return _mm256_load_pd(mem); // aligned 256-bit load: mem must sit on a 32-byte boundary
+        }
+
+        namespace detail
+        {
+            // load_complex: de-interleaves two batches of (re, im) pairs into one complex batch
+            template <class A>
+            inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx>) noexcept
+            {
+                using batch_type = batch<float, A>;
+                __m128 tmp0 = _mm256_extractf128_ps(hi, 0);
+                __m128 tmp1 = _mm256_extractf128_ps(hi, 1);
+                __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); // even elements of `hi`
+                __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); // odd elements of `hi`
+                batch_type real = _mm256_castps128_ps256(tmp_real); // parts taken from `hi` fill the low 128 bits
+                batch_type imag = _mm256_castps128_ps256(tmp_imag);
+
+                tmp0 = _mm256_extractf128_ps(lo, 0);
+                tmp1 = _mm256_extractf128_ps(lo, 1);
+                tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0));
+                tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
+                real = _mm256_insertf128_ps(real, tmp_real, 1); // parts taken from `lo` fill the high 128 bits
+                imag = _mm256_insertf128_ps(imag, tmp_imag, 1);
+                return { real, imag };
+            }
+            template <class A>
+            inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx>) noexcept
+            {
+                using batch_type = batch<double, A>;
+                // Elements 0 and 2 of `hi` are used as real parts, 1 and 3 as imaginary parts.
+                __m128d tmp0 = _mm256_extractf128_pd(hi, 0);
+                __m128d tmp1 = _mm256_extractf128_pd(hi, 1);
+                batch_type real = _mm256_castpd128_pd256(_mm_unpacklo_pd(tmp0, tmp1));
+                batch_type imag = _mm256_castpd128_pd256(_mm_unpackhi_pd(tmp0, tmp1));
+
+                // Same split for `lo`, written straight into the high 128 bits (as in the float overload).
+                tmp0 = _mm256_extractf128_pd(lo, 0);
+                tmp1 = _mm256_extractf128_pd(lo, 1);
+                real = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1);
+                imag = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1);
+                return { real, imag };
+            }
+        }
+
+        // load_unaligned
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
+        {
+            return _mm256_loadu_si256((__m256i const*)mem); // 256-bit load with no alignment requirement on mem
+        }
+        template <class A> // load_unaligned (float)
+        inline batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
+        {
+            return _mm256_loadu_ps(mem); // 256-bit load with no alignment requirement on mem
+        }
+        template <class A> // load_unaligned (double)
+        inline batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
+        {
+            return _mm256_loadu_pd(mem); // 256-bit load with no alignment requirement on mem
+        }
+
+        // lt
+        template <class A> // lt (float): lane-wise self < other
+        inline batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_cmp_ps(self, other, _CMP_LT_OQ); // ordered, quiet predicate: a NaN operand yields false
+        }
+        template <class A> // lt (double): lane-wise self < other
+        inline batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_cmp_pd(self, other, _CMP_LT_OQ); // ordered, quiet predicate: a NaN operand yields false
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            auto lt_on_half = [](__m128i lhs_half, __m128i rhs_half) noexcept // integer compare of one 128-bit half (sse4_2)
+            { return lt(batch<T, sse4_2>(lhs_half), batch<T, sse4_2>(rhs_half)); };
+            return detail::fwd_to_sse(lt_on_half, self, other);
+        }
+
+        // mask
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_movemask_pd(_mm256_castsi256_pd(self));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_movemask_ps(_mm256_castsi256_ps(self));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2 || sizeof(T) == 1)
+            {
+                __m128i lower_half, upper_half; // 8/16-bit lanes: one sse4_2 mask per 128-bit half, upper bits shifted into place
+                detail::split_avx(self, lower_half, upper_half);
+                return (mask(batch_bool<T, sse4_2>(upper_half), sse4_2 {}) << (128 / (8 * sizeof(T)))) | mask(batch_bool<T, sse4_2>(lower_half), sse4_2 {});
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+        template <class A> // mask (float): packs the lane mask into an integer bitmask
+        inline uint64_t mask(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_movemask_ps(self); // one bit per lane, taken from the lane's sign bit
+        }
+
+        template <class A> // mask (double): packs the lane mask into an integer bitmask
+        inline uint64_t mask(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_movemask_pd(self); // one bit per lane, taken from the lane's sign bit
+        }
+
+        // max
+        template <class A> // max (float): lane-wise maximum
+        inline batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_max_ps(self, other);
+        }
+        template <class A> // max (double): lane-wise maximum
+        inline batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_max_pd(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return select(self > other, self, other); // self where it is strictly greater, other elsewhere
+        }
+
+        // min
+        template <class A> // min (float): lane-wise minimum
+        inline batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_min_ps(self, other);
+        }
+        template <class A> // min (double): lane-wise minimum
+        inline batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_min_pd(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return select(self <= other, self, other); // self where it is not greater, other elsewhere
+        }
+
+        // mul
+        template <class A> // mul (float): lane-wise product
+        inline batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_mul_ps(self, other);
+        }
+        template <class A> // mul (double): lane-wise product
+        inline batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_mul_pd(self, other);
+        }
+
+        // nearbyint
+        template <class A> // nearbyint (float): rounds every lane to an integral value, result stays floating point
+        inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT); // rounding mode comes from the immediate: to nearest
+        }
+        template <class A> // nearbyint (double): rounds every lane to an integral value, result stays floating point
+        inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT); // rounding mode comes from the immediate: to nearest
+        }
+
+        // nearbyint_as_int
+        template <class A> // nearbyint_as_int: rounds every float lane and returns it as int32
+        inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+                                                  requires_arch<avx>) noexcept
+        {
+            return _mm256_cvtps_epi32(self); // conversion rounds according to the current MXCSR rounding mode
+        }
+
+        // neg
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> neg(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            return 0 - self; // integer negation written as a subtraction from zero
+        }
+        template <class A> // neg (float): lane-wise negation; inline + noexcept like every sibling overload
+        inline batch<float, A> neg(batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); // XOR flips the sign bit of each lane
+        }
+        template <class A> // neg (double): lane-wise negation
+        inline batch<double, A> neg(batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000))); // XOR flips the sign bit of each lane
+        }
+
+        // neq
+        template <class A> // neq (float): lane-wise self != other
+        inline batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_cmp_ps(self, other, _CMP_NEQ_UQ); // unordered, quiet predicate: a NaN operand yields true
+        }
+        template <class A> // neq (double): lane-wise self != other
+        inline batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_cmp_pd(self, other, _CMP_NEQ_UQ); // unordered, quiet predicate: a NaN operand yields true
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return ~(self == other); // integer inequality is the complement of the equality mask
+        }
+
+        template <class A> // neq on float masks
+        inline batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_xor_ps(self, other); // bitwise XOR of the two masks
+        }
+        template <class A> // neq on double masks
+        inline batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_xor_pd(self, other); // bitwise XOR of the two masks
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(self.data), _mm256_castsi256_ps(other.data))); // XOR done through the float view of the registers
+        }
+
+        // reciprocal
+        template <class A> // reciprocal (float): lane-wise 1/x
+        inline batch<float, A> reciprocal(batch<float, A> const& self,
+                                          kernel::requires_arch<avx>) noexcept
+        {
+            return _mm256_rcp_ps(self); // hardware approximation, not a full-precision division
+        }
+
+        // reduce_add
+        template <class A>
+        inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx>) noexcept
+        {
+            // _mm256_hadd_ps works inside each 128-bit half separately:
+            // _mm256_hadd_ps(a,b) = (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7),
+            // so the two halves are folded onto each other first and hadd is then
+            // only used to finish the reduction of the four partial sums.
+            // rhs = (x0, x1, x2, x3, x4, x5, x6, x7)
+            // swapped = (x4, x5, x6, x7, x0, x1, x2, x3)
+            const __m256 swapped = _mm256_permute2f128_ps(rhs, rhs, 1);
+            // folded = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7)
+            const __m256 folded = _mm256_add_ps(rhs, swapped);
+            // pairs = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -)
+            const __m256 pairs = _mm256_hadd_ps(folded, folded);
+            // total = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -)
+            const __m256 total = _mm256_hadd_ps(pairs, pairs);
+            return _mm_cvtss_f32(_mm256_extractf128_ps(total, 0));
+        }
+        template <class A>
+        inline double reduce_add(batch<double, A> const& rhs, requires_arch<avx>) noexcept
+        {
+            // rhs = (x0, x1, x2, x3)
+            // swapped = (x2, x3, x0, x1)
+            const __m256d swapped = _mm256_permute2f128_pd(rhs, rhs, 1);
+            // folded = (x2+x0, x3+x1, -, -)
+            const __m256d folded = _mm256_add_pd(rhs, swapped);
+            // total = (x2+x0+x3+x1, -, -, -)
+            const __m256d total = _mm256_hadd_pd(folded, folded);
+            return _mm_cvtsd_f64(_mm256_extractf128_pd(total, 0));
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            // Sum each 128-bit half as an sse4_2 batch, then add the two scalar results.
+            __m128i lower_half, upper_half;
+            detail::split_avx(self, lower_half, upper_half);
+            return reduce_add(batch<T, sse4_2>(lower_half)) + reduce_add(batch<T, sse4_2>(upper_half));
+        }
+
+        // reduce_max
+        template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+        inline T reduce_max(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            // Combine `self` with its permuted copy, then reduce the low 128 bits as an sse4_2 batch.
+            constexpr auto lane_control = detail::shuffle(1, 0); // presumably selects "halves swapped" - see detail::shuffle
+            const batch<T, A> permuted = _mm256_permute2f128_si256(self, self, lane_control);
+            const batch<T, A> folded = max(self, permuted);
+            return reduce_max(batch<T, sse4_2>(_mm256_castsi256_si128(folded)));
+        }
+
+        // reduce_min
+        template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+        inline T reduce_min(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            // Combine `self` with its permuted copy, then reduce the low 128 bits as an sse4_2 batch.
+            constexpr auto lane_control = detail::shuffle(1, 0); // presumably selects "halves swapped" - see detail::shuffle
+            const batch<T, A> permuted = _mm256_permute2f128_si256(self, self, lane_control);
+            const batch<T, A> folded = min(self, permuted);
+            return reduce_min(batch<T, sse4_2>(_mm256_castsi256_si128(folded)));
+        }
+
+        // rsqrt
+        template <class A> // rsqrt (float): lane-wise 1/sqrt(x)
+        inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
+        {
+            return _mm256_rsqrt_ps(val); // hardware approximation, not a full-precision result
+        }
+        template <class A> // rsqrt (double): lane-wise 1/sqrt(x), computed through single precision
+        inline batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
+        {
+            return _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(val))); // narrow to float, approximate, widen back to double
+        }
+
+        // sadd
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            if (!std::is_signed<T>::value)
+            {
+                // Unsigned: never add more than the headroom left above `self`.
+                const auto headroom = std::numeric_limits<T>::max() - self;
+                return self + min(headroom, other);
+            }
+            else
+            {
+                const auto other_sign = (other >> (8 * sizeof(T) - 1)); // sign bit of `other` shifted across the lane
+                const auto capped_from_above = min(std::numeric_limits<T>::max() - other, self);
+                const auto capped_from_below = max(std::numeric_limits<T>::min() - other, self);
+                return other + select(batch_bool<T, A>(other_sign.data), capped_from_below, capped_from_above);
+            }
+        }
+
+        // select
+        template <class A> // select (float): true_br where cond is set, false_br elsewhere
+        inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+        {
+            return _mm256_blendv_ps(false_br, true_br, cond); // blendv takes its second operand where the mask lane's sign bit is set
+        }
+        template <class A> // select (double): true_br where cond is set, false_br elsewhere
+        inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+        {
+            return _mm256_blendv_pd(false_br, true_br, cond); // blendv takes its second operand where the mask lane's sign bit is set
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+        {
+            // Integer lanes are selected one 128-bit half at a time by the sse4_2 kernel.
+            __m128i mask_lo, mask_hi;
+            __m128i yes_lo, yes_hi;
+            __m128i no_lo, no_hi;
+            detail::split_avx(cond, mask_lo, mask_hi);
+            detail::split_avx(true_br, yes_lo, yes_hi);
+            detail::split_avx(false_br, no_lo, no_hi);
+
+            const batch_bool<T, sse4_2> lower_mask(mask_lo), upper_mask(mask_hi);
+            const __m128i picked_lo = select(lower_mask, batch<T, sse4_2>(yes_lo), batch<T, sse4_2>(no_lo), sse4_2 {});
+            const __m128i picked_hi = select(upper_mask, batch<T, sse4_2>(yes_hi), batch<T, sse4_2>(no_hi), sse4_2 {});
+            return detail::merge_sse(picked_lo, picked_hi);
+        }
+        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+        {
+            return select(batch_bool<T, A> { Values... }, true_br, false_br, avx {}); // constant mask made a run-time mask; dispatch stays on the avx tag
+        }
+
+        template <class A, bool... Values> // select (float) with a mask known at compile time
+        inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+        {
+            constexpr auto mask = batch_bool_constant<batch<float, A>, Values...>::mask(); // constant handed to the blend as its immediate
+            return _mm256_blend_ps(false_br, true_br, mask); // set bit -> lane taken from true_br
+        }
+
+        template <class A, bool... Values> // select (double) with a mask known at compile time
+        inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+        {
+            constexpr auto mask = batch_bool_constant<batch<double, A>, Values...>::mask(); // constant handed to the blend as its immediate
+            return _mm256_blend_pd(false_br, true_br, mask); // set bit -> lane taken from true_br
+        }
+
+        // set
+        template <class A, class... Values> // set (float): builds a batch from one value per lane
+        inline batch<float, A> set(batch<float, A> const&, requires_arch<avx>, Values... values) noexcept
+        {
+            static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init"); // exactly one value per lane
+            return _mm256_setr_ps(values...); // setr: the first value goes to lane 0
+        }
+
+        template <class A, class... Values> // set (double): builds a batch from one value per lane
+        inline batch<double, A> set(batch<double, A> const&, requires_arch<avx>, Values... values) noexcept
+        {
+            static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init"); // exactly one value per lane
+            return _mm256_setr_pd(values...); // setr: the first value goes to lane 0
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3) noexcept
+        {
+            return _mm256_set_epi64x(v3, v2, v1, v0); // set (not setr) lists the highest lane first, so v0 still lands in lane 0
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+        {
+            return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7); // eight values: one per 32-bit lane, v0 in lane 0
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+        {
+            return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); // sixteen values: one per 16-bit lane, v0 in lane 0
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+                               T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
+        {
+            return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); // thirty-two values: one per 8-bit lane, v0 in lane 0
+        }
+
+        template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx>, Values... values) noexcept
+        {
+            return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data; // each flag becomes a -1 (all bits set) or 0 lane
+        }
+
+        template <class A, class... Values> // set (float mask): builds a lane mask from one flag per lane
+        inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<avx>, Values... values) noexcept
+        {
+            static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init"); // exactly one flag per lane
+            return _mm256_castsi256_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data); // built as int32 lanes of -1 / 0, then viewed as float
+        }
+
+        template <class A, class... Values> // set (double mask): builds a lane mask from one flag per lane
+        inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<avx>, Values... values) noexcept
+        {
+            static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init"); // exactly one flag per lane
+            return _mm256_castsi256_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data); // built as int64 lanes of -1 / 0, then viewed as double
+        }
+
+        // slide_left
+        template <size_t N, class A, class T> // slide_left: shifts the whole 256-bit register by N bytes toward the high end, zero-filling
+        inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx>) noexcept
+        {
+            constexpr unsigned BitCount = N * 8;
+            if (BitCount == 0)
+            {
+                return x;
+            }
+            if (BitCount >= 256)
+            {
+                return batch<T, A>(T(0)); // every byte is shifted out
+            }
+            if (BitCount > 128)
+            {
+                constexpr unsigned M = (BitCount - 128) / 8; // bytes to shift beyond one full 128-bit half
+                __m128i low = _mm256_castsi256_si128(x);
+                auto y = _mm_slli_si128(low, M);
+                __m256i zero = _mm256_setzero_si256();
+                return _mm256_insertf128_si256(zero, y, 1); // low half is zero, the shifted low half becomes the high half
+            }
+            if (BitCount == 128)
+            {
+                __m128i low = _mm256_castsi256_si128(x);
+                __m256i zero = _mm256_setzero_si256();
+                return _mm256_insertf128_si256(zero, low, 1); // low half moves to the high half unchanged
+            }
+            // shifting by [0, 128[ bits
+            constexpr unsigned M = BitCount / 8;
+
+            __m128i low = _mm256_castsi256_si128(x);
+            auto ylow = _mm_slli_si128(low, M);
+            auto zlow = _mm_srli_si128(low, 16 - M); // bytes of the low half that cross into the high half
+
+            __m128i high = _mm256_extractf128_si256(x, 1);
+            auto yhigh = _mm_slli_si128(high, M);
+
+            __m256i res = _mm256_castsi128_si256(ylow);
+            return _mm256_insertf128_si256(res, _mm_or_si128(yhigh, zlow), 1); // high half = shifted high half | carried-over bytes
+        }
+
+        // slide_right
+        template <size_t N, class A, class T> // slide_right: shifts the whole 256-bit register by N bytes toward the low end, zero-filling
+        inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx>) noexcept
+        {
+            constexpr unsigned BitCount = N * 8;
+            if (BitCount == 0)
+            {
+                return x;
+            }
+            if (BitCount >= 256)
+            {
+                return batch<T, A>(T(0)); // every byte is shifted out
+            }
+            if (BitCount > 128)
+            {
+                constexpr unsigned M = (BitCount - 128) / 8; // bytes to shift beyond one full 128-bit half
+                __m128i high = _mm256_extractf128_si256(x, 1);
+                __m128i y = _mm_srli_si128(high, M);
+                __m256i zero = _mm256_setzero_si256();
+                return _mm256_insertf128_si256(zero, y, 0); // high half is zero, the shifted high half becomes the low half
+            }
+            if (BitCount == 128)
+            {
+                __m128i high = _mm256_extractf128_si256(x, 1);
+                return _mm256_castsi128_si256(high); // high half moves to the low half unchanged
+            }
+            // shifting by [0, 128[ bits
+            constexpr unsigned M = BitCount / 8;
+
+            __m128i low = _mm256_castsi256_si128(x);
+            auto ylow = _mm_srli_si128(low, M);
+
+            __m128i high = _mm256_extractf128_si256(x, 1);
+            auto yhigh = _mm_srli_si128(high, M);
+            auto zhigh = _mm_slli_si128(high, 16 - M); // bytes of the high half that cross into the low half
+
+            __m256i res = _mm256_castsi128_si256(_mm_or_si128(ylow, zhigh)); // low half = shifted low half | carried-over bytes
+            return _mm256_insertf128_si256(res, yhigh, 1);
+        }
+
+        // sqrt
+        template <class A> // sqrt (float): lane-wise square root
+        inline batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
+        {
+            return _mm256_sqrt_ps(val);
+        }
+        template <class A>
+        inline batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
+        {
+            return _mm256_sqrt_pd(val);
+        }
+
+        // ssub: subtraction of integer batches that clamps instead of wrapping (see the per-branch notes)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            if (!std::is_signed<T>::value)
+            {
+                return self - min(self, other); // unsigned: never subtract more than self, so the result bottoms out at 0
+            }
+            else
+            {
+                // signed: delegate to sadd on the negated operand. NOTE(review): -other cannot represent the negation of the minimum value -- confirm this is acceptable
+                return sadd(self, -other);
+            }
+        }
+
+        // store_aligned: write the register to mem through the aligned-store intrinsics (mem must meet their alignment requirement)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_store_si256(reinterpret_cast<__m256i*>(mem), self);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_store_si256(reinterpret_cast<__m256i*>(mem), self); // the mask register is stored as raw bits
+        }
+        template <class A>
+        inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_store_ps(mem, self);
+        }
+        template <class A>
+        inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_store_pd(mem, self);
+        }
+
+        // store_unaligned: write the register to mem through the unaligned-store intrinsics
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(mem), self);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(mem), self); // the mask register is stored as raw bits
+        }
+        template <class A>
+        inline void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_storeu_ps(mem, self);
+        }
+        template <class A>
+        inline void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_storeu_pd(mem, self);
+        }
+
+        // sub: lane-wise subtraction, self - other
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+                                      { return sub(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+                                      self, other); // integers: handed to the sse4_2 kernel through detail::fwd_to_sse (presumably once per 128-bit half -- confirm in its definition)
+        }
+        template <class A>
+        inline batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_sub_ps(self, other); // packed single precision
+        }
+        template <class A>
+        inline batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            return _mm256_sub_pd(self, other); // packed double precision
+        }
+
+        // swizzle: compile-time permutation of the elements of a register; output element i is input element Vi
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
+        {
+            // build two registers holding the upper (resp. lower) 128-bit half of the input in both lanes
+            const __m128 upper = _mm256_extractf128_ps(self, 1);
+            const __m256 upper_twice = _mm256_insertf128_ps(self, upper, 0);
+
+            const __m128 lower = _mm256_castps256_ps128(self);
+            const __m256 lower_twice = _mm256_insertf128_ps(self, lower, 1);
+
+            // reduce every index to a position inside a 128-bit lane (four floats per lane)
+            batch_constant<batch<uint32_t, A>, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> in_lane_idx;
+
+            // run the same in-lane permutation on both duplicated registers
+            const __m256 from_lower = _mm256_permutevar_ps(lower_twice, (batch<uint32_t, A>)in_lane_idx);
+            const __m256 from_upper = _mm256_permutevar_ps(upper_twice, (batch<uint32_t, A>)in_lane_idx);
+
+            // an index of 4 or more designates an element of the upper half of the input
+            batch_bool_constant<batch<uint32_t, A>, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> pick_upper;
+
+            // per output element, keep the permute that was fed with the right half
+            constexpr auto pick_bits = pick_upper.mask();
+            return _mm256_blend_ps(from_lower, from_upper, pick_bits);
+        }
+
+        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx>) noexcept
+        {
+            // build two registers holding the upper (resp. lower) 128-bit half of the input in both lanes
+            const __m128d upper = _mm256_extractf128_pd(self, 1);
+            const __m256d upper_twice = _mm256_insertf128_pd(self, upper, 0);
+
+            const __m128d lower = _mm256_castpd256_pd128(self);
+            const __m256d lower_twice = _mm256_insertf128_pd(self, lower, 1);
+
+            // in-lane selector (two doubles per lane): odd indices become all-ones so the selector bit of the control element is set
+            batch_constant<batch<uint64_t, A>, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> in_lane_idx;
+
+            // run the same in-lane permutation on both duplicated registers
+            const __m256d from_lower = _mm256_permutevar_pd(lower_twice, (batch<uint64_t, A>)in_lane_idx);
+            const __m256d from_upper = _mm256_permutevar_pd(upper_twice, (batch<uint64_t, A>)in_lane_idx);
+
+            // an index of 2 or more designates an element of the upper half of the input
+            batch_bool_constant<batch<uint64_t, A>, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> pick_upper;
+
+            // per output element, keep the permute that was fed with the right half
+            constexpr auto pick_bits = pick_upper.mask();
+            return _mm256_blend_pd(from_lower, from_upper, pick_bits);
+        }
+        template <class A,
+                  typename T,
+                  uint32_t V0,
+                  uint32_t V1,
+                  uint32_t V2,
+                  uint32_t V3,
+                  uint32_t V4,
+                  uint32_t V5,
+                  uint32_t V6,
+                  uint32_t V7,
+                  detail::enable_sized_integral_t<T, 4> = 0>
+        inline batch<T, A> swizzle(batch<T, A> const& self,
+                                   batch_constant<batch<uint32_t, A>,
+                                                  V0,
+                                                  V1,
+                                                  V2,
+                                                  V3,
+                                                  V4,
+                                                  V5,
+                                                  V6,
+                                                  V7> const& mask,
+                                   requires_arch<avx>) noexcept
+        {
+            const auto permuted = swizzle(bitwise_cast<float>(self), mask); // 4-byte integers: permute the same bits viewed as floats
+            return bitwise_cast<T>(permuted);
+        }
+
+        template <class A,
+                  typename T,
+                  uint64_t V0,
+                  uint64_t V1,
+                  uint64_t V2,
+                  uint64_t V3,
+                  detail::enable_sized_integral_t<T, 8> = 0>
+        inline batch<T, A>
+        swizzle(batch<T, A> const& self,
+                batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> const& mask,
+                requires_arch<avx>) noexcept
+        {
+            const auto permuted = swizzle(bitwise_cast<double>(self), mask); // 8-byte integers: permute the same bits viewed as doubles
+            return bitwise_cast<T>(permuted);
+        }
+
+        // trunc: round every element towards zero
+        template <class A>
+        inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_round_ps(self, _MM_FROUND_TO_ZERO); // packed single precision
+        }
+        template <class A>
+        inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_round_pd(self, _MM_FROUND_TO_ZERO); // packed double precision
+        }
+
+        // zip_hi: interleave the elements of the upper halves of self and other (self element first)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+            {
+                // 8/16-bit elements: work on the upper 128-bit halves with SSE unpacks
+                const __m128i a_upper = _mm256_extractf128_si256(self, 1);
+                const __m128i b_upper = _mm256_extractf128_si256(other, 1);
+
+                // interleave those halves, element by element
+                __m128i mixed_lo, mixed_hi;
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    mixed_lo = _mm_unpacklo_epi8(a_upper, b_upper);
+                    mixed_hi = _mm_unpackhi_epi8(a_upper, b_upper);
+                }
+                else
+                {
+                    mixed_lo = _mm_unpacklo_epi16(a_upper, b_upper);
+                    mixed_hi = _mm_unpackhi_epi16(a_upper, b_upper);
+                }
+
+                // fuse: the first interleaved half becomes the low lane,
+                // the second interleaved half becomes the high lane
+                return _mm256_insertf128_si256(
+                    _mm256_castsi128_si256(mixed_lo),
+                    mixed_hi,
+                    1);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                const __m256 pairs_lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+                const __m256 pairs_hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+                return _mm256_castps_si256(_mm256_permute2f128_ps(pairs_lo, pairs_hi, 0x31)); // 0x31: high lane of each operand
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                const __m256d pairs_lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+                const __m256d pairs_hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+                return _mm256_castpd_si256(_mm256_permute2f128_pd(pairs_lo, pairs_hi, 0x31)); // 0x31: high lane of each operand
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+        template <class A>
+        inline batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            const __m256 pairs_lo = _mm256_unpacklo_ps(self, other); // the unpacks interleave inside each 128-bit lane
+            const __m256 pairs_hi = _mm256_unpackhi_ps(self, other);
+            return _mm256_permute2f128_ps(pairs_lo, pairs_hi, 0x31); // keep the high lane of each: the zipped upper half
+        }
+        template <class A>
+        inline batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            const __m256d pairs_lo = _mm256_unpacklo_pd(self, other); // the unpacks interleave inside each 128-bit lane
+            const __m256d pairs_hi = _mm256_unpackhi_pd(self, other);
+            return _mm256_permute2f128_pd(pairs_lo, pairs_hi, 0x31); // keep the high lane of each: the zipped upper half
+        }
+
+        // zip_lo: interleave the elements of the lower halves of self and other (self element first)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+            {
+                // 8/16-bit elements: work on the lower 128-bit halves with SSE unpacks
+                const __m128i a_lower = _mm256_extractf128_si256(self, 0);
+                const __m128i b_lower = _mm256_extractf128_si256(other, 0);
+
+                // interleave those halves, element by element
+                __m128i mixed_lo, mixed_hi;
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    mixed_lo = _mm_unpacklo_epi8(a_lower, b_lower);
+                    mixed_hi = _mm_unpackhi_epi8(a_lower, b_lower);
+                }
+                else
+                {
+                    mixed_lo = _mm_unpacklo_epi16(a_lower, b_lower);
+                    mixed_hi = _mm_unpackhi_epi16(a_lower, b_lower);
+                }
+
+                // fuse: the first interleaved half becomes the low lane,
+                // the second interleaved half becomes the high lane
+                return _mm256_insertf128_si256(
+                    _mm256_castsi128_si256(mixed_lo),
+                    mixed_hi,
+                    1);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                const __m256 pairs_lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+                const __m256 pairs_hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+                return _mm256_castps_si256(_mm256_insertf128_ps(pairs_lo, _mm256_castps256_ps128(pairs_hi), 1)); // low lane of each operand
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                const __m256d pairs_lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+                const __m256d pairs_hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+                return _mm256_castpd_si256(_mm256_insertf128_pd(pairs_lo, _mm256_castpd256_pd128(pairs_hi), 1)); // low lane of each operand
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
+        template <class A>
+        inline batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+        {
+            const __m256 pairs_lo = _mm256_unpacklo_ps(self, other); // the unpacks interleave inside each 128-bit lane
+            const __m256 pairs_hi = _mm256_unpackhi_ps(self, other);
+            return _mm256_insertf128_ps(pairs_lo, _mm256_castps256_ps128(pairs_hi), 1); // keep the low lane of each: the zipped lower half
+        }
+        template <class A>
+        inline batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+        {
+            const __m256d pairs_lo = _mm256_unpacklo_pd(self, other); // the unpacks interleave inside each 128-bit lane
+            const __m256d pairs_hi = _mm256_unpackhi_pd(self, other);
+            return _mm256_insertf128_pd(pairs_lo, _mm256_castpd256_pd128(pairs_hi), 1); // keep the low lane of each: the zipped lower half
+        }
+    }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp
new file mode 100644
index 0000000000..8d0fcc27a4
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp
@@ -0,0 +1,950 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX2_HPP
+#define XSIMD_AVX2_HPP
+
+#include <complex>
+#include <type_traits>
+
+#include "../types/xsimd_avx2_register.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        // abs: lane-wise absolute value of integer batches; unsigned batches are returned unchanged
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_abs_epi8(self);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_abs_epi16(self);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_abs_epi32(self);
+                }
+                else
+                {
+                    return abs(self, avx {}); // remaining element sizes: defer to the avx-level kernel
+                }
+            }
+            return self; // unsigned values are already non-negative
+        }
+
+        // add: lane-wise integer addition, dispatched on the element size to the matching AVX2 intrinsic
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm256_add_epi8(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_add_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_add_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_add_epi64(self, other);
+            }
+            else
+            {
+                return add(self, other, avx {}); // any other element size: defer to the avx-level kernel
+            }
+        }
+
+        // bitwise_and: AND of the full 256-bit registers; the element type plays no role
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_and_si256(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_and_si256(self, other); // same instruction applied to the mask registers
+        }
+
+        // bitwise_andnot: computes self & ~other
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_andnot_si256(other, self); // the intrinsic negates its FIRST operand, hence the swapped order
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_andnot_si256(other, self); // the intrinsic negates its FIRST operand, hence the swapped order
+        }
+
+        // bitwise_not: flip every bit of the register
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); // XOR with an all-ones register
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); // XOR with an all-ones register
+        }
+
+        // bitwise_lshift: shift integer lanes left, either all by one scalar count or each by its own count
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_slli_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_slli_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_slli_epi64(self, other);
+            }
+            else
+            {
+                return bitwise_lshift(self, other, avx {}); // remaining element sizes (e.g. 8-bit): defer to the avx-level kernel
+            }
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_sllv_epi32(self, other); // per-lane shift counts taken from other
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_sllv_epi64(self, other); // per-lane shift counts taken from other
+            }
+            else
+            {
+                return bitwise_lshift(self, other, avx {}); // only 32/64-bit variable shifts are used here; the rest goes to the avx-level kernel
+            }
+        }
+
+        // bitwise_or: OR of the full 256-bit registers; the element type plays no role
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_or_si256(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_or_si256(self, other); // same instruction applied to the mask registers
+        }
+
+        // bitwise_rshift: shift every lane right by the same scalar count (arithmetic for signed T, logical for unsigned T)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF); // top `other` bits of the low byte of each 16-bit pair
+                    __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self); // all-ones in every negative byte
+                    __m256i res = _mm256_srai_epi16(self, other); // 16-bit shift: right for the high byte, pollutes the low byte
+                    // AVX2 has a native 256-bit AND, so the operands need not be split
+                    // into SSE halves: put sign bits where the neighbouring byte leaked
+                    // into the low byte, and keep the 16-bit shift result everywhere else
+                    return _mm256_or_si256(_mm256_and_si256(sign_mask, cmp_is_negative),
+                                           _mm256_andnot_si256(sign_mask, res));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_srai_epi16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_srai_epi32(self, other);
+                }
+                else
+                {
+                    return bitwise_rshift(self, other, avx {}); // remaining signed sizes: defer to the avx-level kernel
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_srli_epi16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_srli_epi32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm256_srli_epi64(self, other);
+                }
+                else
+                {
+                    return bitwise_rshift(self, other, avx {}); // remaining unsigned sizes: defer to the avx-level kernel
+                }
+            }
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            if (std::is_signed<T>::value) // per-lane counts: arithmetic shift for signed T, logical for unsigned T
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_srav_epi32(self, other);
+                }
+                else
+                {
+                    return bitwise_rshift(self, other, avx {}); // other signed sizes: defer to the avx-level kernel
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_srlv_epi32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm256_srlv_epi64(self, other);
+                }
+                else
+                {
+                    return bitwise_rshift(self, other, avx {}); // other unsigned sizes: defer to the avx-level kernel
+                }
+            }
+        }
+
+        // bitwise_xor: XOR of the full 256-bit registers; the element type plays no role
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_xor_si256(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_xor_si256(self, other); // masks in, mask out -- same return type as the other batch_bool bitwise kernels
+        }
+
+        // complex_low: interleave the first two real/imaginary pairs, giving [re0, im0, re1, im1]
+        template <class A>
+        inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
+        {
+            const __m256d re_spread = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0)); // re0 and re1 land on slots 0 and 2
+            const __m256d im_spread = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0)); // im0 and im1 land on slots 1 and 3
+            return _mm256_blend_pd(re_spread, im_spread, 0xA); // 0xA: odd slots come from the imaginary parts
+        }
+
+        // complex_high: interleave the last two real/imaginary pairs, giving [re2, im2, re3, im3]
+        template <class A>
+        inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
+        {
+            const __m256d re_spread = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2)); // re2 and re3 land on slots 0 and 2
+            const __m256d im_spread = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0)); // im2 and im3 land on slots 1 and 3
+            return _mm256_blend_pd(re_spread, im_spread, 0xA); // 0xA: odd slots come from the imaginary parts
+        }
+
+        // fast_cast: integer -> floating point conversions that have no single AVX2 instruction
+        namespace detail
+        {
+
+            template <class A>
+            inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx2>) noexcept
+            {
+                // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
+                __m256i msk_lo = _mm256_set1_epi32(0xFFFF);
+                __m256 cnst65536f = _mm256_set1_ps(65536.0f);
+
+                __m256i v_lo = _mm256_and_si256(v, msk_lo); /* 16 least significant bits of v */
+                __m256i v_hi = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */
+                __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding: a 16-bit value fits a float exactly */
+                __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding: a 16-bit value fits a float exactly */
+                v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding: scaling by a power of two */
+                return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
+            }
+
+            template <class A>
+            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                // adapted to avx: each 32-bit half of x is planted in the mantissa of a large constant, then the constants are subtracted out
+                __m256i xH = _mm256_srli_epi64(x, 32); // upper 32 bits of every element
+                xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); //  2^84
+                __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
+                                                 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); // selects the lower 32 bits of every element
+                __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); //  2^52
+                __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); //  2^84 + 2^52
+                return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); // adding the low part cancels the remaining 2^52 offset
+            }
+
+            template <class A>
+            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                // adapted to avx: same offset trick as the unsigned version, with an arithmetic shift so the sign is carried by the high part
+                __m256i xH = _mm256_srai_epi32(x, 16); // arithmetic shift of every 32-bit half
+                xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); // keep only the upper 32-bit half of every element
+                xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); //  3*2^67
+                __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
+                                                 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); // selects the lower 48 bits of every element
+                __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); //  2^52
+                __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); //  3*2^67 + 2^52
+                return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); // adding the low part cancels the remaining 2^52 offset
+            }
+        }
+
+        // eq: lane-wise integer equality; the comparison intrinsic is chosen from the element size
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm256_cmpeq_epi8(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_cmpeq_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_cmpeq_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_cmpeq_epi64(self, other);
+            }
+            else
+            {
+                return eq(self, other, avx {}); // any other element size: defer to the avx-level kernel
+            }
+        }
+
+        // gather
+        template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
+        inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+                                  kernel::requires_arch<avx2>) noexcept
+        {
+            // scatter for this one is AVX512F+AVX512VL
+            return _mm256_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
+        }
+
+        template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
+        inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+                                  kernel::requires_arch<avx2>) noexcept
+        {
+            const long long int* base = reinterpret_cast<const long long int*>(src); // 64-bit elements, 64-bit indices (scatter for this one is AVX512F+AVX512VL)
+            return _mm256_i64gather_epi64(base, index, sizeof(T));
+        }
+
+        template <class A, class U,
+                  detail::enable_sized_integral_t<U, 4> = 0>
+        inline batch<float, A> gather(batch<float, A> const&, float const* src,
+                                      batch<U, A> const& index,
+                                      kernel::requires_arch<avx2>) noexcept
+        {
+            // 32-bit indices, scaled by sizeof(float), pick the floats to load; scatter for this one is AVX512F+AVX512VL
+            return _mm256_i32gather_ps(src, index, sizeof(float));
+        }
+
+        template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
+        inline batch<double, A> gather(batch<double, A> const&, double const* src,
+                                       batch<U, A> const& index,
+                                       requires_arch<avx2>) noexcept
+        {
+            // 64-bit indices, scaled by sizeof(double), pick the doubles to load; scatter for this one is AVX512F+AVX512VL
+            return _mm256_i64gather_pd(src, index, sizeof(double));
+        }
+
+        // gather: handmade conversions (doubles are gathered four at a time from each index half, then narrowed)
+        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
+        inline batch<float, A> gather(batch<float, A> const&, double const* src,
+                                      batch<V, A> const& index,
+                                      requires_arch<avx2>) noexcept
+        {
+            const __m256d front = _mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double));
+            const __m256d back = _mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double));
+            return detail::merge_sse(_mm256_cvtpd_ps(front), _mm256_cvtpd_ps(back));
+        }
+
+        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
+        inline batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
+                                        batch<V, A> const& index,
+                                        requires_arch<avx2>) noexcept
+        {
+            const __m256d front = _mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)); // lower four indices
+            const __m256d back = _mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)); // upper four indices
+            return detail::merge_sse(_mm256_cvtpd_epi32(front), _mm256_cvtpd_epi32(back)); // convert to int32, then join the halves
+        }
+
+        // lt: lane-wise integer "less than"; signed lanes use the AVX2 greater-than compares
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            // Only signed compares are issued below, so unsigned lanes are
+            // handed to the avx kernel straight away.
+            if (!std::is_signed<T>::value)
+            {
+                return lt(self, other, avx {});
+            }
+            // self < other is evaluated as other > self, hence the swapped operands.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm256_cmpgt_epi8(other, self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_cmpgt_epi16(other, self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_cmpgt_epi32(other, self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_cmpgt_epi64(other, self);
+            }
+            else
+            {
+                return lt(self, other, avx {});
+            }
+        }
+
+        // load_complex: split the interleaved floats held in hi and lo into a real batch and an imaginary batch
+        template <class A>
+        inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
+        {
+            // Within each 128-bit lane, take the even-indexed floats of hi followed by
+            // those of lo (they end up in the real batch), and likewise the
+            // odd-indexed floats (they end up in the imaginary batch).
+            const __m256d even = _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)));
+            const __m256d odd = _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)));
+            // The shuffle works per lane, so the two middle 64-bit chunks are swapped to
+            // put everything taken from hi ahead of everything taken from lo.
+            batch<float, A> real = _mm256_castpd_ps(_mm256_permute4x64_pd(even, _MM_SHUFFLE(3, 1, 2, 0)));
+            batch<float, A> imag = _mm256_castpd_ps(_mm256_permute4x64_pd(odd, _MM_SHUFFLE(3, 1, 2, 0)));
+            return { real, imag };
+        }
+        template <class A>
+        inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx2>) noexcept
+        {
+            using batch_type = batch<double, A>;
+            batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); // hi[0], hi[2], lo[0], lo[2]
+            batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); // hi[1], hi[3], lo[1], lo[3]
+            return { real, imag };
+        }
+        // mask: build an integer bit mask from a batch_bool (8- and 16-bit lanes here; other widths via the avx kernel)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self); // one bit per byte; the & drops the sign extension of the int result
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self); // 32 byte-level bits, i.e. two per 16-bit lane
+                return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12); // each call supplies 4 result bits; NOTE(review): mask_lut is defined elsewhere, presumably it keeps one bit per byte pair
+            }
+            else
+            {
+                return mask(self, avx {});
+            }
+        }
+
+        // max: lane-wise integer maximum (8/16/32-bit lanes handled here, any other width by the avx kernel)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            if (std::is_unsigned<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_max_epu8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_max_epu16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_max_epu32(self, other);
+                }
+                else
+                {
+                    return max(self, other, avx {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_max_epi8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_max_epi16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_max_epi32(self, other);
+                }
+                else
+                {
+                    return max(self, other, avx {});
+                }
+            }
+        }
+
+        // min: lane-wise integer minimum (8/16/32-bit lanes handled here, any other width by the avx kernel)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            if (std::is_unsigned<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_min_epu8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_min_epu16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_min_epu32(self, other);
+                }
+                else
+                {
+                    return min(self, other, avx {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_min_epi8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_min_epi16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_min_epi32(self, other);
+                }
+                else
+                {
+                    return min(self, other, avx {});
+                }
+            }
+        }
+
+        // mul: lane-wise integer product, keeping the low bits that fit the lane
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                // 8-bit lanes: two 16-bit multiplies, one whose low byte is the even-byte product, one whose high byte is the odd-byte product.
+                const __m256i odd_bytes = _mm256_set1_epi32(0xFF00FF00);
+                const __m256i even_prod = _mm256_mullo_epi16(self, other);
+                const __m256i rhs_odd = _mm256_srli_epi16(other, 8); // odd bytes of other, moved down to bits 0..7
+                const __m256i lhs_odd = _mm256_and_si256(self, odd_bytes); // odd bytes of self, left in bits 8..15
+                const __m256i odd_prod = _mm256_mullo_epi16(lhs_odd, rhs_odd);
+                return _mm256_blendv_epi8(even_prod, odd_prod, odd_bytes);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_mullo_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_mullo_epi32(self, other);
+            }
+            else
+            {
+                return mul(self, other, avx {});
+            }
+        }
+
+        // reduce_add: horizontal sum of all lanes of self (32- and 64-bit lanes here; other widths via the avx kernel)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                __m256i tmp1 = _mm256_hadd_epi32(self, self); // pairwise sums inside each 128-bit lane
+                __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1); // every element now holds the total of its own lane
+                __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
+                __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3); // low-lane total + high-lane total
+                return _mm_cvtsi128_si32(tmp4);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E); // move the upper 64 bits of each lane down to the lower 64
+                __m256i tmp2 = _mm256_add_epi64(self, tmp1); // element 0 of each lane = that lane's total
+                __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
+                __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
+#if defined(__x86_64__) || defined(_M_X64) // _M_X64 is the MSVC spelling; without it MSVC x64 took the slower path below
+                return _mm_cvtsi128_si64(res);
+#else
+                __m128i m; // 32-bit targets: spill the low 64 bits and copy them out
+                _mm_storel_epi64(&m, res);
+                int64_t i;
+                std::memcpy(&i, &m, sizeof(i));
+                return i;
+#endif
+            }
+            else
+            {
+                return reduce_add(self, avx {});
+            }
+        }
+
+        // sadd: saturating lane-wise addition (8- and 16-bit lanes here; wider lanes via the avx kernel)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            if (std::is_unsigned<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_adds_epu8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_adds_epu16(self, other);
+                }
+                else
+                {
+                    return sadd(self, other, avx {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_adds_epi8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_adds_epi16(self, other);
+                }
+                else
+                {
+                    return sadd(self, other, avx {});
+                }
+            }
+        }
+
+        // select: per-lane choice between two integer batches, driven by a run-time mask.
+        //   cond     - lane mask
+        //   true_br  - source of the lanes whose mask is set
+        //   false_br - source of the lanes whose mask is clear
+        // Returns a batch holding the chosen lanes.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+        {
+            // A single byte-granular blend serves every lane width: _mm256_blendv_epi8
+            // picks each result byte according to the top bit of the matching byte
+            // of cond.
+            // NOTE(review): this presumes every mask lane is all-ones or all-zeros
+            // (which is what the compare intrinsics return) - confirm for other
+            // producers of batch_bool.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8)
+            {
+                // Operand order: "clear" source, "set" source, selector.
+                return _mm256_blendv_epi8(false_br, true_br, cond);
+            }
+            else
+            {
+                // Any other lane width is left to the avx kernel.
+                return select(cond, true_br, false_br, avx {});
+            }
+        }
+        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+        {
+            constexpr int lanes = batch_bool_constant<batch<T, A>, Values...>::mask(); // compile-time selection bits
+            // FIXME: for some reason the lane mask here is not considered as an immediate,
+            // but it's okay for _mm256_blend_epi32
+            // case 2: return _mm256_blend_epi16(false_br, true_br, lanes);
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                constexpr int dword_lanes = detail::interleave(lanes); // presumably widens each 64-bit lane bit to two 32-bit lane bits - helper defined elsewhere
+                return _mm256_blend_epi32(false_br, true_br, dword_lanes);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_blend_epi32(false_br, true_br, lanes);
+            }
+            else
+            {
+                return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {}); // other widths: materialise the mask and use the run-time overload
+            }
+        }
+
+        // slide_left: shift the whole 256-bit register by N bytes towards the higher lanes, filling with zeros
+        template <size_t N, class A, class T>
+        inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
+        {
+            constexpr unsigned BitCount = N * 8;
+            if (BitCount == 0)
+            {
+                return x;
+            }
+            if (BitCount >= 256)
+            {
+                return batch<T, A>(T(0)); // everything shifted out
+            }
+            if (BitCount > 128)
+            {
+                constexpr unsigned M = (BitCount - 128) / 8; // bytes left to shift once a whole lane has been crossed
+                auto y = _mm256_bslli_epi128(x, M); // byte shift inside each 128-bit lane
+                return _mm256_permute2x128_si256(y, y, 0x28); // 0x28: low lane = 0, high lane = low lane of y
+            }
+            if (BitCount == 128)
+            {
+                return _mm256_permute2x128_si256(x, x, 0x28);
+            }
+            // shifting by [0, 128[ bits
+            constexpr unsigned M = BitCount / 8;
+            auto y = _mm256_bslli_epi128(x, M); // per-lane shift; bytes crossing the lane boundary are lost here
+            auto z = _mm256_bsrli_epi128(x, 16 - M); // ...so recover the bytes that leave the top of each lane
+            auto w = _mm256_permute2x128_si256(z, z, 0x28); // and carry those of the low lane into the high lane
+            return _mm256_or_si256(y, w);
+        }
+
+        // slide_right: shift the whole 256-bit register by N bytes towards the lower lanes, filling with zeros
+        template <size_t N, class A, class T>
+        inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
+        {
+            constexpr unsigned BitCount = N * 8;
+            if (BitCount == 0)
+            {
+                return x;
+            }
+            if (BitCount >= 256)
+            {
+                return batch<T, A>(T(0)); // everything shifted out
+            }
+            if (BitCount > 128)
+            {
+                constexpr unsigned M = (BitCount - 128) / 8; // bytes left to shift once a whole lane has been crossed
+                auto y = _mm256_bsrli_epi128(x, M); // byte shift inside each 128-bit lane
+                return _mm256_permute2x128_si256(y, y, 0x81); // 0x81: low lane = high lane of y, high lane = 0
+            }
+            if (BitCount == 128)
+            {
+                return _mm256_permute2x128_si256(x, x, 0x81);
+            }
+            // shifting by [0, 128[ bits
+            constexpr unsigned M = BitCount / 8;
+            auto y = _mm256_bsrli_epi128(x, M); // per-lane shift; bytes crossing the lane boundary are lost here
+            auto z = _mm256_bslli_epi128(x, 16 - M); // ...so recover the bytes that leave the bottom of each lane
+            auto w = _mm256_permute2x128_si256(z, z, 0x81); // and carry those of the high lane into the low lane
+            return _mm256_or_si256(y, w);
+        }
+
+        // ssub: saturating lane-wise subtraction (8- and 16-bit lanes here; wider lanes via the avx kernel)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            if (std::is_unsigned<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_subs_epu8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_subs_epu16(self, other);
+                }
+                else
+                {
+                    return ssub(self, other, avx {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm256_subs_epi8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm256_subs_epi16(self, other);
+                }
+                else
+                {
+                    return ssub(self, other, avx {});
+                }
+            }
+        }
+
+        // sub: lane-wise integer subtraction; 8/16/32/64-bit lanes each map to one AVX2 instruction
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_sub_epi64(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_sub_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm256_sub_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm256_sub_epi8(self, other);
+            }
+            else
+            {
+                return sub(self, other, avx {}); // any other lane width: defer to the avx kernel
+            }
+        }
+
+        // swizzle: reorder lanes; result lane i is self[Vi], with the indices fixed at compile time
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+        {
+            return _mm256_permutevar8x32_ps(self, static_cast<batch<uint32_t, A>>(mask));
+        }
+
+        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+        {
+            constexpr auto imm = detail::shuffle(V0, V1, V2, V3); // the four indices folded into the intrinsic's immediate
+            return _mm256_permute4x64_pd(self, imm);
+        }
+
+        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+        {
+            constexpr auto imm = detail::shuffle(V0, V1, V2, V3); // the four indices folded into the intrinsic's immediate
+            return _mm256_permute4x64_epi64(self, imm);
+        }
+        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
+        {
+            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {})); // signed lanes go through the uint64_t overload via bitwise_cast in and out
+        }
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+        {
+            return _mm256_permutevar8x32_epi32(self, static_cast<batch<uint32_t, A>>(mask)); // indices are materialised as a run-time vector operand
+        }
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+        {
+            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {})); // signed lanes go through the uint32_t overload via bitwise_cast in and out
+        }
+
+        // zip_hi: interleave the lanes of the upper halves of self and other (self's lane first in each pair)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                const __m256i from_low = _mm256_unpacklo_epi64(self, other);
+                const __m256i from_high = _mm256_unpackhi_epi64(self, other);
+                return _mm256_permute2f128_si256(from_low, from_high, 0x31);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                const __m256i from_low = _mm256_unpacklo_epi32(self, other);
+                const __m256i from_high = _mm256_unpackhi_epi32(self, other);
+                return _mm256_permute2f128_si256(from_low, from_high, 0x31);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                const __m256i from_low = _mm256_unpacklo_epi16(self, other);
+                const __m256i from_high = _mm256_unpackhi_epi16(self, other);
+                return _mm256_permute2f128_si256(from_low, from_high, 0x31);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                const __m256i from_low = _mm256_unpacklo_epi8(self, other); // per 128-bit lane: low 8 bytes of each input, interleaved
+                const __m256i from_high = _mm256_unpackhi_epi8(self, other); // per 128-bit lane: high 8 bytes of each input, interleaved
+                return _mm256_permute2f128_si256(from_low, from_high, 0x31); // 0x31: keep the upper 128-bit lane of both
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
+        // zip_lo: interleave the lanes of the lower halves of self and other (self's lane first in each pair)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                const __m256i from_low = _mm256_unpacklo_epi64(self, other);
+                const __m256i from_high = _mm256_unpackhi_epi64(self, other);
+                return _mm256_inserti128_si256(from_low, _mm256_castsi256_si128(from_high), 1);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                const __m256i from_low = _mm256_unpacklo_epi32(self, other);
+                const __m256i from_high = _mm256_unpackhi_epi32(self, other);
+                return _mm256_inserti128_si256(from_low, _mm256_castsi256_si128(from_high), 1);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                const __m256i from_low = _mm256_unpacklo_epi16(self, other);
+                const __m256i from_high = _mm256_unpackhi_epi16(self, other);
+                return _mm256_inserti128_si256(from_low, _mm256_castsi256_si128(from_high), 1);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                const __m256i from_low = _mm256_unpacklo_epi8(self, other); // per 128-bit lane: low 8 bytes of each input, interleaved
+                const __m256i from_high = _mm256_unpackhi_epi8(self, other); // per 128-bit lane: high 8 bytes of each input, interleaved
+                return _mm256_inserti128_si256(from_low, _mm256_castsi256_si128(from_high), 1); // keep the lower 128-bit lane of both
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+    }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp
new file mode 100644
index 0000000000..77182e1ef2
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp
@@ -0,0 +1,627 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512BW_HPP
+#define XSIMD_AVX512BW_HPP
+
+#include <array>
+#include <type_traits>
+
+#include "../types/xsimd_avx512bw_register.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        namespace detail
+        {
+            template <class A, class T, int Cmp> // Cmp: comparison predicate handed to the *_mask compare intrinsics
+            inline batch_bool<T, A> compare_int_avx512bw(batch<T, A> const& self, batch<T, A> const& other) noexcept
+            {
+                using register_type = typename batch_bool<T, A>::register_type; // each intrinsic's mask result is cast to this
+                if (std::is_unsigned<T>::value)
+                {
+                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                    {
+                        return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                    {
+                        return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                    {
+                        return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                    {
+                        return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
+                    }
+                }
+                else
+                {
+                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                    {
+                        return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                    {
+                        return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                    {
+                        return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                    {
+                        return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
+                    }
+                }
+            }
+        }
+
+        // abs: lane-wise absolute value (8- and 16-bit lanes here; wider lanes via the avx512dq kernel)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512bw>) noexcept
+        {
+            if (!std::is_signed<T>::value)
+            {
+                return self;
+            }
+            // Unsigned input was returned unchanged above; only signed lanes reach this point.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm512_abs_epi16(self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm512_abs_epi8(self);
+            }
+            else
+            {
+                return abs(self, avx512dq {});
+            }
+        }
+
+        // add: lane-wise integer addition (8- and 16-bit lanes here; wider lanes via the avx512dq kernel)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm512_add_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm512_add_epi8(self, other);
+            }
+            else
+            {
+                return add(self, other, avx512dq {});
+            }
+        }
+
+        // bitwise_lshift: shift every lane left by the same run-time count `other`
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
+        {
+            // Only 16-bit lanes are handled here. The preprocessor switch picks either the
+            // per-lane-count shift fed a broadcast of `other`, or the plain shift-by-count form.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+                return _mm512_sllv_epi16(self, _mm512_set1_epi16(other));
+#else
+                return _mm512_slli_epi16(self, other);
+#endif
+            }
+            else
+            {
+                return bitwise_lshift(self, other, avx512dq {});
+            }
+        }
+
+        // bitwise_rshift: shift every lane right by `other` (arithmetic shift for signed T, logical for unsigned T)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    // 8-bit lanes are shifted as 16-bit lanes. That is already right for the
+                    // upper byte of each pair; the lower byte picks up bits of its neighbour
+                    // in its top `other` positions, so those are cleared and refilled with
+                    // the lower byte's own sign.
+                    const __m512i fill_bits = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF);
+                    const __m512i none = _mm512_setzero_si512();
+                    const __mmask64 negative = _mm512_cmpgt_epi8_mask(none, self);
+                    const __m512i sign_fill = _mm512_mask_blend_epi8(negative, none, fill_bits);
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+                    const __m512i shifted = _mm512_srav_epi16(self, _mm512_set1_epi16(other));
+#else
+                    const __m512i shifted = _mm512_srai_epi16(self, other);
+#endif
+                    return _mm512_or_si512(sign_fill, _mm512_andnot_si512(fill_bits, shifted));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+                    return _mm512_srav_epi16(self, _mm512_set1_epi16(other));
+#else
+                    return _mm512_srai_epi16(self, other);
+#endif
+                }
+                else
+                {
+                    return bitwise_rshift(self, other, avx512dq {});
+                }
+            }
+            else
+            {
+                // Unsigned: only 16-bit lanes get a dedicated logical shift here.
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+                    return _mm512_srlv_epi16(self, _mm512_set1_epi16(other));
+#else
+                    return _mm512_srli_epi16(self, other);
+#endif
+                }
+                else
+                {
+                    return bitwise_rshift(self, other, avx512dq {});
+                }
+            }
+        }
+
+        // eq: lane-wise integer equality, via the shared compare helper with the _MM_CMPINT_EQ predicate
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_EQ>(self, other);
+        }
+
+        // ge: lane-wise comparison of integer batches, delegated to compare_int_avx512bw with _MM_CMPINT_GE.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GE>(self, other);
+        }
+
+        // gt: lane-wise comparison of integer batches, delegated to compare_int_avx512bw with _MM_CMPINT_GT.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GT>(self, other);
+        }
+
+        // le: lane-wise comparison of integer batches, delegated to compare_int_avx512bw with _MM_CMPINT_LE.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LE>(self, other);
+        }
+
+        // lt: lane-wise comparison of integer batches, delegated to compare_int_avx512bw with _MM_CMPINT_LT.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LT>(self, other);
+        }
+
+        // max: lane-wise maximum of two integer batches.
+        // The lane width is dispatched first: 8- and 16-bit lanes map onto the
+        // byte/word max instructions, with the signed (epi) or unsigned (epu)
+        // form chosen from T. Every other width is forwarded to the avx512dq
+        // kernel.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm512_max_epi8(self, other);
+                }
+                else
+                {
+                    return _mm512_max_epu8(self, other);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm512_max_epi16(self, other);
+                }
+                else
+                {
+                    return _mm512_max_epu16(self, other);
+                }
+            }
+            else
+            {
+                return max(self, other, avx512dq {});
+            }
+        }
+
+        // min: lane-wise minimum of two integer batches.
+        // The lane width is dispatched first: 8- and 16-bit lanes map onto the
+        // byte/word min instructions, with the signed (epi) or unsigned (epu)
+        // form chosen from T. Every other width is forwarded to the avx512dq
+        // kernel.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm512_min_epi8(self, other);
+                }
+                else
+                {
+                    return _mm512_min_epu8(self, other);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm512_min_epi16(self, other);
+                }
+                else
+                {
+                    return _mm512_min_epu16(self, other);
+                }
+            }
+            else
+            {
+                return min(self, other, avx512dq {});
+            }
+        }
+
+        // mul: lane-wise integer product; 8-bit lanes are assembled from two 16-bit multiplies (low byte, then high byte).
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                __m512i low_bytes = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_set1_epi16(0x00FF));
+                __m512i high_bytes = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8);
+                return _mm512_or_si512(low_bytes, high_bytes);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm512_mullo_epi16(self, other);
+            }
+            else
+            {
+                return mul(self, other, avx512dq {});
+            }
+        }
+
+        // neq: lane-wise inequality for integer batches, delegated to compare_int_avx512bw with _MM_CMPINT_NE.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_NE>(self, other);
+        }
+
+        // sadd: lane-wise addition through the saturating-add intrinsics.
+        // The lane width is dispatched first: 8- and 16-bit lanes use
+        // _mm512_adds_*, with the signed (epi) or unsigned (epu) form chosen
+        // from T. Every other width is forwarded to the avx512dq
+        // kernel.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm512_adds_epi8(self, other);
+                }
+                else
+                {
+                    return _mm512_adds_epu8(self, other);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm512_adds_epi16(self, other);
+                }
+                else
+                {
+                    return _mm512_adds_epu16(self, other);
+                }
+            }
+            else
+            {
+                return sadd(self, other, avx512dq {});
+            }
+        }
+
+        // select: per-lane choice between true_br and false_br driven by cond; 8/16-bit lanes use the mask blend intrinsics.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512bw>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data);
+            }
+            else
+            {
+                return select(cond, true_br, false_br, avx512dq {});
+            }
+        }
+
+        // slide_left
+        namespace detail
+        {
+            // Index tables for slide_left: perm_hi entry i is i - 1 (entry 0 is 8); pattern/mask are keyed on Is >= N.
+            template <size_t... Is>
+            constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_hi(::xsimd::detail::index_sequence<Is...>)
+            {
+                return { (Is != 0 ? Is - 1 : 8)... };
+            }
+            template <size_t N, size_t... Is>
+            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_pattern(::xsimd::detail::index_sequence<Is...>)
+            {
+                return { (Is < N ? 0 : Is - N)... };
+            }
+            template <size_t N, size_t... Is>
+            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_mask(::xsimd::detail::index_sequence<Is...>)
+            {
+                return { (Is < N ? 0x0000 : 0xFFFF)... };
+            }
+        }
+
+        // Shifts the whole 512-bit register towards higher lanes by N bytes,
+        // filling the vacated low bytes with zeros. An odd N is handled by a
+        // one-byte shift across the 64-bit lanes first; the remaining N / 2
+        // 16-bit words are then moved with a single masked word permutation.
+        template <size_t N, class A, class T>
+        inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
+        {
+            constexpr unsigned BitCount = N * 8;
+            if (BitCount == 0)
+            {
+                return x;
+            }
+            if (BitCount >= 512)
+            {
+                return batch<T, A>(T(0));
+            }
+            batch<T, A> xx;
+            if (N & 1)
+            {
+                // Each 64-bit lane is shifted left by one byte (xl); the byte that falls
+                // off the top of lane i - 1 (xr) is routed into the bottom of lane i,
+                // with lane 0 receiving zero from the second permutation source.
+                alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>());
+                __m512i xl = _mm512_slli_epi64(x, 8);
+                __m512i xr = _mm512_srli_epi64(x, 56);
+                xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
+                xx = _mm512_or_si512(xr, xl);
+                if (N == 1)
+                    return xx;
+            }
+            else
+            {
+                xx = x;
+            }
+            alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
+            alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
+            return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
+        }
+
+        // slide_right
+        namespace detail
+        {
+            // Index tables for slide_right: perm_low entry i is i + 1; pattern/mask are keyed on Is < 32 - N.
+            template <size_t... Is>
+            constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_low(::xsimd::detail::index_sequence<Is...>)
+            {
+                return { (1 + Is)... };
+            }
+            template <size_t N, size_t... Is>
+            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_pattern(::xsimd::detail::index_sequence<Is...>)
+            {
+                return { (Is >= (32 - N) ? 0 : Is + N)... };
+            }
+            template <size_t N, size_t... Is>
+            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_mask(::xsimd::detail::index_sequence<Is...>)
+            {
+                return { (Is >= 32 - N ? 0x0000 : 0xFFFF)... };
+            }
+        }
+        // Shifts the whole 512-bit register towards lower lanes by N bytes, zero-filling
+        // the top. An odd N first moves everything down by one byte across the 64-bit
+        // lanes; the remaining N / 2 16-bit words are handled by a masked word permutation.
+        template <size_t N, class A, class T>
+        inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
+        {
+            constexpr unsigned BitCount = N * 8;
+            if (BitCount == 0)
+            {
+                return x;
+            }
+            if (BitCount >= 512)
+            {
+                return batch<T, A>(T(0));
+            }
+            batch<T, A> shifted = x;
+            if (N & 1)
+            {
+                // srli keeps the lane-local part; the byte leaving lane i + 1 is carried into the top of lane i.
+                alignas(A::alignment()) auto lane_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>());
+                __m512i within_lane = _mm512_srli_epi64(x, 8);
+                __m512i carried = _mm512_slli_epi64(x, 56);
+                carried = _mm512_permutex2var_epi64(carried, _mm512_load_epi64(lane_perm.data()), _mm512_setzero_si512());
+                shifted = _mm512_or_si512(within_lane, carried);
+                if (N == 1)
+                    return shifted;
+            }
+            alignas(A::alignment()) auto word_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
+            alignas(A::alignment()) auto word_mask = detail::make_slide_right_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
+            return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(word_pattern.data()), shifted), _mm512_load_epi32(word_mask.data()));
+        }
+
+        // ssub: lane-wise subtraction through the saturating-subtract intrinsics.
+        // The lane width is dispatched first: 8- and 16-bit lanes use
+        // _mm512_subs_*, with the signed (epi) or unsigned (epu) form chosen
+        // from T. Every other width is forwarded to the avx512dq
+        // kernel.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm512_subs_epi8(self, other);
+                }
+                else
+                {
+                    return _mm512_subs_epu8(self, other);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm512_subs_epi16(self, other);
+                }
+                else
+                {
+                    return _mm512_subs_epu16(self, other);
+                }
+            }
+            else
+            {
+                return ssub(self, other, avx512dq {});
+            }
+        }
+
+        // sub: lane-wise integer subtraction; 8/16-bit lanes use _mm512_sub_epi8/16, other widths go to the avx512dq kernel.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm512_sub_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm512_sub_epi8(self, other);
+            }
+            else
+            {
+                return sub(self, other, avx512dq {});
+            }
+        }
+
+        // swizzle
+        // 16-bit lanes: the compile-time index list is converted to a batch and used as the word-permutation index.
+        template <class A, uint16_t... Vs>
+        inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+        {
+            return _mm512_permutexvar_epi16(batch<uint16_t, A>(mask), self);
+        }
+
+        template <class A, uint16_t... Vs>
+        inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+        {
+            return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512bw {})); // signed lanes reuse the uint16_t overload through bit casts
+        }
+        // NOTE(review): _mm512_shuffle_epi8 selects bytes within each 128-bit lane only - confirm cross-lane indices are never expected here.
+        template <class A, uint8_t... Vs>
+        inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+        {
+            return _mm512_shuffle_epi8(self, batch<uint8_t, A>(mask));
+        }
+
+        template <class A, uint8_t... Vs>
+        inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+        {
+            return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, avx512bw {})); // signed lanes reuse the uint8_t overload through bit casts
+        }
+
+        // zip_hi: assembled from 128-bit lanes 2 and 3 of the unpacklo/unpackhi results (8/16-bit); wider types go to avx512f.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            __m512i unpacked_lo, unpacked_hi;
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                unpacked_lo = _mm512_unpacklo_epi8(self, other);
+                unpacked_hi = _mm512_unpackhi_epi8(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                unpacked_lo = _mm512_unpacklo_epi16(self, other);
+                unpacked_hi = _mm512_unpackhi_epi16(self, other);
+            }
+            else
+            {
+                return zip_hi(self, other, avx512f {});
+            }
+            // Target 128-bit lane layout: [lo2, hi2, lo3, hi3].
+            // Start from unpacked_hi, whose lane 3 is already in place, then
+            // overwrite lanes 0, 2 and 1 one at a time. The lane-1 value is
+            // read from the untouched unpacked_hi, not from the partial result.
+            __m512i merged = _mm512_inserti32x4(unpacked_hi, _mm512_extracti32x4_epi32(unpacked_lo, 2), 0);
+            merged = _mm512_inserti32x4(merged, _mm512_extracti32x4_epi32(unpacked_lo, 3), 2);
+            return _mm512_inserti32x4(merged, _mm512_extracti32x4_epi32(unpacked_hi, 2), 1);
+        }
+
+        // zip_lo: assembled from 128-bit lanes 0 and 1 of the unpacklo/unpackhi results (8/16-bit); wider types go to avx512f.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            __m512i unpacked_lo, unpacked_hi;
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                unpacked_lo = _mm512_unpacklo_epi8(self, other);
+                unpacked_hi = _mm512_unpackhi_epi8(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                unpacked_lo = _mm512_unpacklo_epi16(self, other);
+                unpacked_hi = _mm512_unpackhi_epi16(self, other);
+            }
+            else
+            {
+                return zip_lo(self, other, avx512f {});
+            }
+            // Target 128-bit lane layout: [lo0, hi0, lo1, hi1].
+            // Start from unpacked_lo, whose lane 0 is already in place, then
+            // overwrite lanes 1, 3 and 2 one at a time. The lane-2 value is
+            // read from the untouched unpacked_lo, not from the partial result.
+            __m512i merged = _mm512_inserti32x4(unpacked_lo, _mm512_extracti32x4_epi32(unpacked_hi, 0), 1);
+            merged = _mm512_inserti32x4(merged, _mm512_extracti32x4_epi32(unpacked_hi, 1), 3);
+            return _mm512_inserti32x4(merged, _mm512_extracti32x4_epi32(unpacked_lo, 1), 2);
+        }
+    }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp
new file mode 100644
index 0000000000..95f3f1df8f
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp
@@ -0,0 +1,28 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512CD_HPP
+#define XSIMD_AVX512CD_HPP
+
+#include "../types/xsimd_avx512cd_register.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        // Nothing here yet.
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp
new file mode 100644
index 0000000000..7840ea8fc5
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp
@@ -0,0 +1,212 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512DQ_HPP
+#define XSIMD_AVX512DQ_HPP
+
+#include "../types/xsimd_avx512dq_register.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        // bitwise_and: lane-wise AND of float / double batches via _mm512_and_ps / _mm512_and_pd.
+        template <class A>
+        inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_and_ps(self, other);
+        }
+        template <class A>
+        inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_and_pd(self, other);
+        }
+
+        // bitwise_andnot: self & ~other; the intrinsic complements its first argument, hence the (other, self) order.
+        template <class A>
+        inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_andnot_ps(other, self);
+        }
+        template <class A>
+        inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_andnot_pd(other, self);
+        }
+
+        // bitwise_not: XOR with an all-ones pattern. NOTE(review): tagged requires_arch<avx512f>, unlike the avx512dq siblings in this file - confirm intended.
+        template <class A>
+        inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
+        }
+        template <class A>
+        inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
+        }
+
+        // bitwise_or: lane-wise OR for float / double batches; batch_bool operands are OR-ed on their underlying data value.
+        template <class A>
+        inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_or_ps(self, other);
+        }
+        template <class A>
+        inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_or_pd(self, other);
+        }
+
+        template <class A, class T>
+        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return register_type(self.data | other.data);
+        }
+
+        // bitwise_xor: lane-wise XOR of float / double batches via _mm512_xor_ps / _mm512_xor_pd.
+        template <class A>
+        inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_xor_ps(self, other);
+        }
+        template <class A>
+        inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_xor_pd(self, other);
+        }
+
+        // haddp: combines row[0..15] through shuffle/add folding steps into one batch (presumably lane i = sum of row[i] - confirm).
+        template <class A>
+        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512dq>) noexcept
+        {
+            // The following folds over the vector once:
+            // tmp1 = [a0..8, b0..8]
+            // tmp2 = [a8..f, b8..f]
+#define XSIMD_AVX512_HADDP_STEP1(I, a, b)                                \
+    batch<float, avx512f> res##I;                                        \
+    {                                                                    \
+        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
+        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
+        res##I = _mm512_add_ps(tmp1, tmp2);                              \
+    }
+
+            XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
+            XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
+            XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
+            XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
+            XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
+            XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
+            XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
+            XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);
+
+#undef XSIMD_AVX512_HADDP_STEP1
+
+            // The following folds the data again and shuffles so that hadd_ps produces the correct result
+            // tmp1 = [a0..4,  a8..12,  b0..4,  b8..12] (same for tmp3)
+            // tmp2 = [a5..8, a12..16, b5..8, b12..16]  (same for tmp4)
+            // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
+#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d)                               \
+    batch<float, avx2> halfx##I;                                              \
+    {                                                                         \
+        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0));      \
+        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1));      \
+                                                                              \
+        auto resx1 = _mm512_add_ps(tmp1, tmp2);                               \
+                                                                              \
+        auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0));      \
+        auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1));      \
+                                                                              \
+        auto resx2 = _mm512_add_ps(tmp3, tmp4);                               \
+                                                                              \
+        auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
+        auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
+                                                                              \
+        auto resx3 = _mm512_add_ps(tmp5, tmp6);                               \
+                                                                              \
+        halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0),           \
+                                  _mm512_extractf32x8_ps(resx3, 1));          \
+    }
+
+            XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
+            XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);
+
+#undef XSIMD_AVX512_HADDP_STEP2
+
+            auto concat = _mm512_castps256_ps512(halfx0);
+            concat = _mm512_insertf32x8(concat, halfx1, 1);
+            return concat;
+        }
+
+        // ldexp: the int64 exponents in other are converted to double and passed with self to _mm512_scalef_pd.
+        template <class A>
+        inline batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other));
+        }
+
+        // mul: 64-bit lane-wise product for uint64_t and int64_t batches, both through _mm512_mullo_epi64.
+        template <class A>
+        inline batch<uint64_t, A> mul(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_mullo_epi64(self, other);
+        }
+
+        template <class A>
+        inline batch<int64_t, A> mul(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_mullo_epi64(self, other);
+        }
+
+        // nearbyint_as_int: converts each double lane to int64 with _mm512_cvtpd_epi64 (the non-truncating cvt form).
+        template <class A>
+        inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
+                                                  requires_arch<avx512dq>) noexcept
+        {
+            return _mm512_cvtpd_epi64(self);
+        }
+
+        // reduce_add: adds the two 256-bit halves together, then finishes with the avx2 float reduction.
+        template <class A>
+        inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
+        {
+            // Fold 16 floats down to 8 before delegating.
+            __m256 upper_half = _mm512_extractf32x8_ps(rhs, 1);
+            __m256 lower_half = _mm512_extractf32x8_ps(rhs, 0);
+            return reduce_add(batch<float, avx2>(_mm256_add_ps(upper_half, lower_half)), avx2 {});
+        }
+
+        // convert
+        namespace detail
+        {
+            // Lane-wise conversions between int64_t and double batches; the unnamed second argument only selects the target type.
+            template <class A>
+            inline batch<double, A> fast_cast(batch<int64_t, A> const& self, batch<double, A> const&, requires_arch<avx512dq>) noexcept
+            {
+                return _mm512_cvtepi64_pd(self);
+            }
+
+            template <class A>
+            inline batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<avx512dq>) noexcept
+            {
+                return _mm512_cvttpd_epi64(self);
+            }
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp
new file mode 100644
index 0000000000..7eea894137
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp
@@ -0,0 +1,2028 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512F_HPP
+#define XSIMD_AVX512F_HPP
+
+#include <complex>
+#include <limits>
+#include <type_traits>
+
+#include "../types/xsimd_avx512f_register.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        namespace detail
+        {
+            // Splits a 512-bit float register into its lower (low) and upper
+            // (high) 256-bit halves.
+            inline void split_avx512(__m512 val, __m256& low, __m256& high) noexcept
+            {
+                low = _mm512_castps512_ps256(val);
+                // _mm512_extractf32x8_ps needs AVX512DQ, which this AVX512F-only
+                // header cannot assume. Extract through the double-precision view
+                // instead (the same trick merge_avx(__m256, __m256) uses); the
+                // casts do not alter any bit.
+                high = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(val), 1));
+            }
+            // Splits a 512-bit double register into its lower (low) and upper
+            // (high) 256-bit halves.
+            inline void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept
+            {
+                const __m256d lower_half = _mm512_castpd512_pd256(val);
+                const __m256d upper_half = _mm512_extractf64x4_pd(val, 1);
+                low = lower_half;
+                high = upper_half;
+            }
+            // Splits a 512-bit integer register into its lower (low) and upper
+            // (high) 256-bit halves.
+            inline void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept
+            {
+                const __m256i lower_half = _mm512_castsi512_si256(val);
+                const __m256i upper_half = _mm512_extracti64x4_epi64(val, 1);
+                low = lower_half;
+                high = upper_half;
+            }
+            // Builds a 512-bit integer register whose lower half is low and whose
+            // upper half is high.
+            inline __m512i merge_avx(__m256i low, __m256i high) noexcept
+            {
+                const __m512i widened = _mm512_castsi256_si512(low);
+                return _mm512_inserti64x4(widened, high, 1);
+            }
+            // Builds a 512-bit float register whose lower half is low and whose
+            // upper half is high. The insertion is done on the double-precision
+            // view of the registers and cast back to float afterwards.
+            inline __m512 merge_avx(__m256 low, __m256 high) noexcept
+            {
+                const __m512d widened = _mm512_castpd256_pd512(_mm256_castps_pd(low));
+                const __m512d joined = _mm512_insertf64x4(widened, _mm256_castps_pd(high), 1);
+                return _mm512_castpd_ps(joined);
+            }
+            // Builds a 512-bit double register whose lower half is low and whose
+            // upper half is high.
+            inline __m512d merge_avx(__m256d low, __m256d high) noexcept
+            {
+                const __m512d widened = _mm512_castpd256_pd512(low);
+                return _mm512_insertf64x4(widened, high, 1);
+            }
+            // Applies the unary callable f to each 256-bit half of self (lower
+            // half first) and glues the two results back into one 512-bit
+            // register.
+            template <class F>
+            __m512i fwd_to_avx(F f, __m512i self)
+            {
+                __m256i lower, upper;
+                split_avx512(self, lower, upper);
+                const __m256i lower_out = f(lower);
+                const __m256i upper_out = f(upper);
+                return merge_avx(lower_out, upper_out);
+            }
+            // Applies the binary callable f to the matching 256-bit halves of self
+            // and other (lower halves first) and glues the two results back into
+            // one 512-bit register.
+            template <class F>
+            __m512i fwd_to_avx(F f, __m512i self, __m512i other)
+            {
+                __m256i lhs_lower, lhs_upper;
+                __m256i rhs_lower, rhs_upper;
+                split_avx512(self, lhs_lower, lhs_upper);
+                split_avx512(other, rhs_lower, rhs_upper);
+                const __m256i lower_out = f(lhs_lower, rhs_lower);
+                const __m256i upper_out = f(lhs_upper, rhs_upper);
+                return merge_avx(lower_out, upper_out);
+            }
+            // Applies f(half, other) to each 256-bit half of self (lower half
+            // first), passing the same scalar other both times, and glues the two
+            // results back into one 512-bit register.
+            template <class F>
+            __m512i fwd_to_avx(F f, __m512i self, int32_t other)
+            {
+                __m256i lower, upper;
+                split_avx512(self, lower, upper);
+                const __m256i lower_out = f(lower, other);
+                const __m256i upper_out = f(upper, other);
+                return merge_avx(lower_out, upper_out);
+            }
+        }
+        namespace detail
+        {
+
+            // Interleaves the bits of x and y into a single 32-bit value: bit i of
+            // x ends up at bit 2*i and bit i of y at bit 2*i + 1. Each table entry
+            // is its 8-bit index with a zero bit inserted above every source bit.
+            inline uint32_t morton(uint16_t x, uint16_t y) noexcept
+            {
+
+                static const unsigned short MortonTable256[256] = {
+                    0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015,
+                    0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055,
+                    0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115,
+                    0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155,
+                    0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415,
+                    0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455,
+                    0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515,
+                    0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555,
+                    0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015,
+                    0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055,
+                    0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115,
+                    0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155,
+                    0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415,
+                    0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455,
+                    0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515,
+                    0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555,
+                    0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015,
+                    0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055,
+                    0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115,
+                    0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155,
+                    0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415,
+                    0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455,
+                    0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515,
+                    0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555,
+                    0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015,
+                    0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055,
+                    0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115,
+                    0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155,
+                    0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415,
+                    0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455,
+                    0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515,
+                    0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555
+                };
+
+                // Widen to uint32_t before shifting: the unsigned short entries
+                // would otherwise promote to (signed) int, and e.g. 0x5555 << 17
+                // does not fit in a 32-bit int.
+                const uint32_t y_high = MortonTable256[y >> 8];
+                const uint32_t x_high = MortonTable256[x >> 8];
+                const uint32_t y_low = MortonTable256[y & 0xFF];
+                const uint32_t x_low = MortonTable256[x & 0xFF];
+
+                const uint32_t z = (y_high << 17) | (x_high << 16) | (y_low << 1) | x_low;
+                return z;
+            }
+
+            // Compares two integer batches lane by lane and returns the result as
+            // a batch_bool mask. Cmp is forwarded unchanged as the predicate
+            // argument of the _mm512_cmp_ep{i,u}{32,64}_mask intrinsics.
+            //
+            // 32- and 64-bit lanes are compared directly. 8- and 16-bit lanes are
+            // emulated with 32-bit compares: each sub-element position inside the
+            // 32-bit lanes is isolated with a mask, compared on its own, and the
+            // resulting 16-bit masks are interleaved into the final mask.
+            //
+            // NOTE(review): no return statement is reached when sizeof(T) is not
+            // 1, 2, 4 or 8 -- presumably T is always one of those; confirm.
+            template <class A, class T, int Cmp>
+            inline batch_bool<T, A> compare_int_avx512f(batch<T, A> const& self, batch<T, A> const& other) noexcept
+            {
+                using register_type = typename batch_bool<T, A>::register_type;
+                if (std::is_signed<T>::value)
+                {
+                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                    {
+                        // Move each isolated byte to the top byte of its 32-bit
+                        // lane so the signed 32-bit compare sees the byte's sign bit.
+                        uint64_t mask_low0 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x000000FF)) << 24,
+                                                                   (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x000000FF)) << 24,
+                                                                   Cmp);
+                        uint64_t mask_low1 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FF00)) << 16,
+                                                                   (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FF00)) << 16,
+                                                                   Cmp);
+                        uint64_t mask_high0 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x00FF0000)) << 8,
+                                                                    (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x00FF0000)) << 8,
+                                                                    Cmp);
+                        uint64_t mask_high1 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFF000000)),
+                                                                    (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFF000000)),
+                                                                    Cmp);
+                        // Interleave the four 16-bit masks: bit i of the k-th
+                        // partial mask is moved to bit 4 * i + k of the result.
+                        uint64_t mask = 0;
+                        for (unsigned i = 0; i < 16; ++i)
+                        {
+                            mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
+                            mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
+                            mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
+                            mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
+                        }
+                        return (register_type)mask;
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                    {
+                        // Move the low 16-bit half to the top of its 32-bit lane so
+                        // the signed compare sees its sign bit; the high half is
+                        // already there.
+                        uint16_t mask_low = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
+                                                                  (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
+                                                                  Cmp);
+                        uint16_t mask_high = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFFFF0000)),
+                                                                   (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFFFF0000)),
+                                                                   Cmp);
+                        // morton() interleaves the two masks bit by bit
+                        // (low half -> even bits, high half -> odd bits).
+                        return static_cast<register_type>(morton(mask_low, mask_high));
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                    {
+                        return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                    {
+                        return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
+                    }
+                }
+                else
+                {
+                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                    {
+                        // Unsigned: the isolated bytes are compared in place with
+                        // unsigned 32-bit compares, no shift is applied.
+                        uint64_t mask_low0 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x000000FF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x000000FF)), Cmp);
+                        uint64_t mask_low1 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FF00)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FF00)), Cmp);
+                        uint64_t mask_high0 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x00FF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x00FF0000)), Cmp);
+                        uint64_t mask_high1 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFF000000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFF000000)), Cmp);
+                        // Same interleaving as the signed 8-bit case: bit i of the
+                        // k-th partial mask goes to bit 4 * i + k.
+                        uint64_t mask = 0;
+                        for (unsigned i = 0; i < 16; ++i)
+                        {
+                            mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
+                            mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
+                            mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
+                            mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
+                        }
+                        return (register_type)mask;
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                    {
+                        uint16_t mask_low = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FFFF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FFFF)), Cmp);
+                        uint16_t mask_high = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFFFF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFFFF0000)), Cmp);
+                        return static_cast<register_type>(morton(mask_low, mask_high));
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                    {
+                        return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                    {
+                        return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
+                    }
+                }
+            }
+        }
+
+        // abs
+        // Absolute value of a float batch: clears bit 31 (the sign bit) of every
+        // 32-bit lane. The float <-> integer reinterpretation goes through the
+        // cast intrinsics, like the bitwise_* kernels of this file, instead of
+        // dereferencing reinterpret_cast'ed pointers.
+        template <class A>
+        inline batch<float, A> abs(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const __m512i self_bits = _mm512_castps_si512(self);
+            const __m512i magnitude_bits = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), self_bits);
+            return _mm512_castsi512_ps(magnitude_bits);
+        }
+        // Absolute value of a double batch: clears bit 63 (the sign bit) of every
+        // 64-bit lane. The double <-> integer reinterpretation goes through the
+        // cast intrinsics, like the bitwise_* kernels of this file, instead of
+        // dereferencing reinterpret_cast'ed pointers.
+        template <class A>
+        inline batch<double, A> abs(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const __m512i self_bits = _mm512_castpd_si512(self);
+            const __m512i magnitude_bits = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
+                                                            self_bits);
+            return _mm512_castsi512_pd(magnitude_bits);
+        }
+        // Absolute value of an integer batch. Unsigned batches are returned
+        // unchanged; 8- and 16-bit lanes are processed by the AVX2 kernel on each
+        // 256-bit half; 32- and 64-bit lanes use the native AVX512F intrinsics.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            // Nothing to do for unsigned element types.
+            if (std::is_unsigned<T>::value)
+            {
+                return self;
+            }
+
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+            {
+                return detail::fwd_to_avx([](__m256i half) noexcept
+                                          { return abs(batch<T, avx2>(half)); },
+                                          self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm512_abs_epi32(self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm512_abs_epi64(self);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
+        // add
+        // Lane-wise sum of two integer batches. 8- and 16-bit lanes are handled
+        // by the AVX2 kernel on each 256-bit half; 32- and 64-bit lanes use the
+        // native AVX512F intrinsics.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+            {
+                return detail::fwd_to_avx([](__m256i lhs_half, __m256i rhs_half) noexcept
+                                          { return add(batch<T, avx2>(lhs_half), batch<T, avx2>(rhs_half)); },
+                                          self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm512_add_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm512_add_epi64(self, other);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+        // Lane-wise sum of two float batches.
+        template <class A>
+        inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512 sum = _mm512_add_ps(self, other);
+            return sum;
+        }
+        // Lane-wise sum of two double batches.
+        template <class A>
+        inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512d sum = _mm512_add_pd(self, other);
+            return sum;
+        }
+
+        // all
+        // True when the mask held by self equals the all-ones value of its
+        // register type.
+        template <class A, class T>
+        inline bool all(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            const register_type every_bit_set = register_type(-1);
+            return self.data == every_bit_set;
+        }
+
+        // any
+        // True when at least one bit of the mask held by self is set.
+        template <class A, class T>
+        inline bool any(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            const register_type no_bit_set = register_type(0);
+            return self.data != no_bit_set;
+        }
+
+        // batch_bool_cast
+        // Re-types a batch_bool: the mask register of self is handed over as is
+        // to build the batch_bool<T_out, A> result. The unnamed second argument
+        // is never read; it only selects the destination type.
+        template <class A, class T_out, class T_in>
+        inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx512f>) noexcept
+        {
+            const auto& source_mask = self.data;
+            return source_mask;
+        }
+
+        // bitwise_and
+        // Bitwise AND of two float batches. With MSVC the float intrinsic is
+        // called directly; elsewhere the operands are reinterpreted as integers,
+        // ANDed, and reinterpreted back.
+        template <class A>
+        inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+#if defined(_MSC_VER)
+            const __m512 conjunction = _mm512_and_ps(self, other);
+            return conjunction;
+#else
+            const __m512i lhs_bits = _mm512_castps_si512(self);
+            const __m512i rhs_bits = _mm512_castps_si512(other);
+            return _mm512_castsi512_ps(_mm512_and_si512(lhs_bits, rhs_bits));
+#endif
+        }
+        // Bitwise AND of two double batches, performed on their integer
+        // reinterpretation.
+        template <class A>
+        inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512i lhs_bits = _mm512_castpd_si512(self);
+            const __m512i rhs_bits = _mm512_castpd_si512(other);
+            return _mm512_castsi512_pd(_mm512_and_si512(lhs_bits, rhs_bits));
+        }
+
+        // Bitwise AND of two integer batches (any element width).
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512i conjunction = _mm512_and_si512(self, other);
+            return conjunction;
+        }
+
+        // Logical AND of two batch_bool values: ANDs their mask registers.
+        template <class A, class T>
+        inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            const register_type both = register_type(self.data & other.data);
+            return both;
+        }
+
+        // bitwise_andnot
+        // Computes self & ~other on the bit patterns of two float batches.
+        // _mm512_andnot_si512 negates its first operand, hence other comes first.
+        template <class A>
+        inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512i negated_operand = _mm512_castps_si512(other);
+            const __m512i kept_operand = _mm512_castps_si512(self);
+            return _mm512_castsi512_ps(_mm512_andnot_si512(negated_operand, kept_operand));
+        }
+        // Computes self & ~other on the bit patterns of two double batches.
+        // _mm512_andnot_si512 negates its first operand, hence other comes first.
+        template <class A>
+        inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512i negated_operand = _mm512_castpd_si512(other);
+            const __m512i kept_operand = _mm512_castpd_si512(self);
+            return _mm512_castsi512_pd(_mm512_andnot_si512(negated_operand, kept_operand));
+        }
+
+        // Computes self & ~other for integer batches. _mm512_andnot_si512 negates
+        // its first operand, hence other comes first.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512i cleared = _mm512_andnot_si512(other, self);
+            return cleared;
+        }
+
+        // Computes self & ~other on the mask registers of two batch_bool values.
+        template <class A, class T>
+        inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            const register_type cleared = register_type(self.data & ~other.data);
+            return cleared;
+        }
+
+        // bitwise_lshift
+        // Shifts every lane of an integer batch left by the same count `other`.
+        //
+        // When XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY is defined, the
+        // variable-count intrinsics (_mm512_sllv_*) are fed a broadcast of
+        // `other` instead of calling _mm512_slli_* with a run-time count
+        // (presumably because those only accept immediates on such toolchains).
+        // Note that the #if/#else sections below open and close the branches of
+        // one single if/else chain.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                // 8-bit lanes: shift as 32-bit lanes, then AND every byte with
+                // (0xFF << other) to drop the bits that came in from the
+                // neighbouring byte.
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+                __m512i tmp = _mm512_sllv_epi32(self, _mm512_set1_epi32(other));
+#else
+                __m512i tmp = _mm512_slli_epi32(self, other);
+#endif
+                return _mm512_and_si512(_mm512_set1_epi8(0xFF << other), tmp);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                // 16-bit lanes: delegate to the AVX2 kernel on each 256-bit half.
+                return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
+                                          { return bitwise_lshift(batch<T, avx2>(s), o, avx2 {}); },
+                                          self, other);
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm512_sllv_epi32(self, _mm512_set1_epi32(other));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm512_sllv_epi64(self, _mm512_set1_epi64(other));
+#else
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm512_slli_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm512_slli_epi64(self, other);
+#endif
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
+        // bitwise_not
+        // Bitwise complement of an integer batch, obtained by XOR-ing with an
+        // all-ones register.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const __m512i all_ones = _mm512_set1_epi32(-1);
+            return _mm512_xor_si512(self, all_ones);
+        }
+        // Logical negation of a batch_bool: complements its mask register.
+        template <class A, class T>
+        inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            const register_type flipped = register_type(~self.data);
+            return flipped;
+        }
+
+        // Bitwise complement of a float batch: its integer reinterpretation is
+        // XOR-ed with an all-ones register.
+        template <class A>
+        inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const __m512i self_bits = _mm512_castps_si512(self);
+            const __m512i all_ones = _mm512_set1_epi32(-1);
+            return _mm512_castsi512_ps(_mm512_xor_si512(self_bits, all_ones));
+        }
+        // Bitwise complement of a double batch: its integer reinterpretation is
+        // XOR-ed with an all-ones register.
+        template <class A>
+        inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const __m512i self_bits = _mm512_castpd_si512(self);
+            const __m512i all_ones = _mm512_set1_epi32(-1);
+            return _mm512_castsi512_pd(_mm512_xor_si512(self_bits, all_ones));
+        }
+
+        // bitwise_or
+        // Bitwise OR of two float batches, performed on their integer
+        // reinterpretation.
+        template <class A>
+        inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512i lhs_bits = _mm512_castps_si512(self);
+            const __m512i rhs_bits = _mm512_castps_si512(other);
+            return _mm512_castsi512_ps(_mm512_or_si512(lhs_bits, rhs_bits));
+        }
+        // Bitwise OR of two double batches, performed on their integer
+        // reinterpretation.
+        template <class A>
+        inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512i lhs_bits = _mm512_castpd_si512(self);
+            const __m512i rhs_bits = _mm512_castpd_si512(other);
+            return _mm512_castsi512_pd(_mm512_or_si512(lhs_bits, rhs_bits));
+        }
+
+        // Logical OR of two batch_bool values: ORs their mask registers.
+        template <class A, class T>
+        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            const register_type either = register_type(self.data | other.data);
+            return either;
+        }
+
+        // Bitwise OR of two integer batches (any element width).
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512i disjunction = _mm512_or_si512(self, other);
+            return disjunction;
+        }
+
+        // bitwise_rshift
+        // Shifts every lane of an integer batch right by the same count `other`:
+        // signed element types use the arithmetic shift intrinsics (srai/srav),
+        // unsigned ones the logical shift intrinsics (srli/srlv).
+        //
+        // When XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY is defined, the
+        // variable-count intrinsics are fed a broadcast of `other` instead of
+        // calling the *i variants with a run-time count (presumably because
+        // those only accept immediates on such toolchains). Note that the
+        // #if/#else sections below open and close the branches of one single
+        // if/else chain.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                // Signed: only 32- and 64-bit lanes are handled natively.
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm512_srav_epi32(self, _mm512_set1_epi32(other));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm512_srav_epi64(self, _mm512_set1_epi64(other));
+#else
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm512_srai_epi32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm512_srai_epi64(self, other);
+#endif
+                }
+                else
+                {
+                    // Remaining signed widths: delegate to the AVX2 kernel on
+                    // each 256-bit half.
+                    return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
+                                              { return bitwise_rshift(batch<T, avx2>(s), o, avx2 {}); },
+                                              self, other);
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    // Unsigned 8-bit lanes: shift as 32-bit lanes, then AND every
+                    // byte with (0xFF >> other) to drop the bits that came in
+                    // from the neighbouring byte.
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+                    __m512i tmp = _mm512_srlv_epi32(self, _mm512_set1_epi32(other));
+#else
+                    __m512i tmp = _mm512_srli_epi32(self, other);
+#endif
+                    return _mm512_and_si512(_mm512_set1_epi8(0xFF >> other), tmp);
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm512_srlv_epi32(self, _mm512_set1_epi32(other));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm512_srlv_epi64(self, _mm512_set1_epi64(other));
+#else
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm512_srli_epi32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm512_srli_epi64(self, other);
+#endif
+                }
+                else
+                {
+                    // Remaining unsigned widths: delegate to the AVX2 kernel on
+                    // each 256-bit half.
+                    return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
+                                              { return bitwise_rshift(batch<T, avx2>(s), o, avx2 {}); },
+                                              self, other);
+                }
+            }
+        }
+
+        // bitwise_xor
+        // Bitwise XOR of two float batches, performed on their integer
+        // reinterpretation.
+        template <class A>
+        inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512i lhs_bits = _mm512_castps_si512(self);
+            const __m512i rhs_bits = _mm512_castps_si512(other);
+            return _mm512_castsi512_ps(_mm512_xor_si512(lhs_bits, rhs_bits));
+        }
+        // Bitwise XOR of two double batches, performed on their integer
+        // reinterpretation.
+        template <class A>
+        inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512i lhs_bits = _mm512_castpd_si512(self);
+            const __m512i rhs_bits = _mm512_castpd_si512(other);
+            return _mm512_castsi512_pd(_mm512_xor_si512(lhs_bits, rhs_bits));
+        }
+
+        // Logical XOR of two batch_bool values: a lane of the result is set when
+        // exactly one of the two operands has that lane set.
+        template <class A, class T>
+        inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            // Must be ^ and not |: with | lanes set in both operands would stay
+            // set, which is OR, not XOR.
+            return register_type(self.data ^ other.data);
+        }
+
+        // Bitwise XOR of two integer batches (any element width).
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512i difference_bits = _mm512_xor_si512(self, other);
+            return difference_bits;
+        }
+
+        // bitwise_cast
+        // Reinterprets the bits of an integer batch as a float batch. The
+        // unnamed batch<float, A> argument only selects the destination type.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
+        {
+            const __m512 as_float = _mm512_castsi512_ps(self);
+            return as_float;
+        }
+        // Reinterprets the bits of an integer batch as a double batch. The
+        // unnamed batch<double, A> argument only selects the destination type.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
+        {
+            const __m512d as_double = _mm512_castsi512_pd(self);
+            return as_double;
+        }
+        // Reinterprets an integer batch as a batch of another integer type by
+        // wrapping the very same register. Enabled when the common type of T
+        // and Tp is integral.
+        template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
+        inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx512f>) noexcept
+        {
+            const batch<Tp, A> reinterpreted(self.data);
+            return reinterpreted;
+        }
+        // Reinterprets the bits of a float batch as a double batch.
+        template <class A>
+        inline batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
+        {
+            const __m512d as_double = _mm512_castps_pd(self);
+            return as_double;
+        }
+        // Reinterprets the bits of a float batch as an integer batch. The
+        // unnamed batch<T, A> argument only selects the destination type.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
+        {
+            const __m512i as_integer = _mm512_castps_si512(self);
+            return as_integer;
+        }
+        // Reinterprets the bits of a double batch as a float batch.
+        template <class A>
+        inline batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
+        {
+            const __m512 as_float = _mm512_castpd_ps(self);
+            return as_float;
+        }
+        // Reinterprets the bits of a double batch as an integer batch. The
+        // unnamed batch<T, A> argument only selects the destination type.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
+        {
+            const __m512i as_integer = _mm512_castpd_si512(self);
+            return as_integer;
+        }
+
+        // broadcast
+        // Builds an integer batch whose every lane holds val, picking the
+        // _mm512_set1_* intrinsic that matches sizeof(T).
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> broadcast(T val, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm512_set1_epi64(val);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm512_set1_epi32(val);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm512_set1_epi16(val);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm512_set1_epi8(val);
+            }
+            else
+            {
+                assert(false && "unsupported");
+                return {};
+            }
+        }
+        // Builds a float batch whose every lane holds val.
+        template <class A>
+        inline batch<float, A> broadcast(float val, requires_arch<avx512f>) noexcept
+        {
+            const __m512 replicated = _mm512_set1_ps(val);
+            return replicated;
+        }
+        // Fills all eight double lanes with `val`.
+        // Declaration specifiers are ordered `inline <return type>` like every sibling kernel.
+        template <class A>
+        inline batch<double, A> broadcast(double val, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_set1_pd(val);
+        }
+
+        // ceil
+        // Rounds each float lane towards +infinity to an integral value.
+        template <class A>
+        inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const __m512 rounded_up = _mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF);
+            return rounded_up;
+        }
+        // Rounds each double lane towards +infinity to an integral value.
+        template <class A>
+        inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const __m512d rounded_up = _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF);
+            return rounded_up;
+        }
+
+        // convert
+        // Single-instruction conversions between 32-bit integer and float batches.
+        // In each overload the second (unnamed) argument only selects the destination type.
+        namespace detail
+        {
+            // int32 -> float, one conversion per lane.
+            template <class A>
+            inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
+            {
+                return _mm512_cvtepi32_ps(self);
+            }
+
+            // float -> int32 with truncation (the "tt" conversion variant).
+            template <class A>
+            inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx512f>) noexcept
+            {
+                return _mm512_cvttps_epi32(self);
+            }
+
+            // uint32 -> float, one conversion per lane.
+            template <class A>
+            inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
+            {
+                return _mm512_cvtepu32_ps(self);
+            }
+
+            // float -> uint32 with truncation.
+            // Marked inline/noexcept so it matches the three sibling overloads.
+            template <class A>
+            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<avx512f>) noexcept
+            {
+                return _mm512_cvttps_epu32(self);
+            }
+        }
+
+        // Helpers that interleave the real and imaginary parts of a complex batch
+        // into (re, im) pairs, one 512-bit register per half of the batch.
+        namespace detail
+        {
+            // complex_low
+            // Interleaves real[0..7] with imag[0..7]; indices >= 16 pick from the imaginary part.
+            template <class A>
+            inline batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
+            {
+                const __m512i interleave = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+                const __m512 pairs = _mm512_permutex2var_ps(self.real(), interleave, self.imag());
+                return pairs;
+            }
+            // Interleaves real[0..3] with imag[0..3]; indices >= 8 pick from the imaginary part.
+            template <class A>
+            inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
+            {
+                const __m512i interleave = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11);
+                const __m512d pairs = _mm512_permutex2var_pd(self.real(), interleave, self.imag());
+                return pairs;
+            }
+
+            // complex_high
+            // Interleaves real[8..15] with imag[8..15].
+            template <class A>
+            inline batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
+            {
+                const __m512i interleave = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+                const __m512 pairs = _mm512_permutex2var_ps(self.real(), interleave, self.imag());
+                return pairs;
+            }
+            // Interleaves real[4..7] with imag[4..7].
+            template <class A>
+            inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
+            {
+                const __m512i interleave = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15);
+                const __m512d pairs = _mm512_permutex2var_pd(self.real(), interleave, self.imag());
+                return pairs;
+            }
+        }
+
+        // div
+        // Lane-wise quotient self / other for float batches.
+        template <class A>
+        inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512 quotient = _mm512_div_ps(self, other);
+            return quotient;
+        }
+        // Lane-wise quotient self / other for double batches.
+        template <class A>
+        inline batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512d quotient = _mm512_div_pd(self, other);
+            return quotient;
+        }
+
+        // eq
+        // Lane-wise equality of float batches using the ordered, quiet predicate
+        // (a lane holding NaN on either side compares false).
+        template <class A>
+        inline batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const auto equal_lanes = _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ);
+            return equal_lanes;
+        }
+        // Lane-wise equality of double batches using the ordered, quiet predicate
+        // (a lane holding NaN on either side compares false).
+        template <class A>
+        inline batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const auto equal_lanes = _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ);
+            return equal_lanes;
+        }
+
+        // Lane-wise equality of integral batches, delegated to the shared
+        // integer comparison helper with the EQ predicate.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            constexpr int predicate = _MM_CMPINT_EQ;
+            return detail::compare_int_avx512f<A, T, predicate>(self, other);
+        }
+        // Lane-wise equality of two boolean batches: a result bit is set where both
+        // mask bits agree (XNOR). The cast back to the mask type drops the bits
+        // introduced by integer promotion.
+        template <class A, class T>
+        inline batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            const auto differing = self.data ^ other.data;
+            return mask_type(~differing);
+        }
+
+        // floor
+        // Rounds each float lane towards -infinity to an integral value.
+        template <class A>
+        inline batch<float, A> floor(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const __m512 rounded_down = _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF);
+            return rounded_down;
+        }
+        // Rounds each double lane towards -infinity to an integral value.
+        template <class A>
+        inline batch<double, A> floor(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const __m512d rounded_down = _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF);
+            return rounded_down;
+        }
+
+        // fnma
+        // Fused negated multiply-add on float lanes: -(x * y) + z.
+        template <class A>
+        inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+        {
+            const __m512 fused = _mm512_fnmadd_ps(x, y, z);
+            return fused;
+        }
+
+        // Fused negated multiply-add on double lanes: -(x * y) + z.
+        template <class A>
+        inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+        {
+            const __m512d fused = _mm512_fnmadd_pd(x, y, z);
+            return fused;
+        }
+
+        // fma
+        // Fused multiply-add on float lanes: (x * y) + z.
+        template <class A>
+        inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+        {
+            const __m512 fused = _mm512_fmadd_ps(x, y, z);
+            return fused;
+        }
+
+        // Fused multiply-add on double lanes: (x * y) + z.
+        template <class A>
+        inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+        {
+            const __m512d fused = _mm512_fmadd_pd(x, y, z);
+            return fused;
+        }
+
+        // fms
+        // Fused multiply-subtract on float lanes: (x * y) - z.
+        template <class A>
+        inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+        {
+            const __m512 fused = _mm512_fmsub_ps(x, y, z);
+            return fused;
+        }
+
+        // Fused multiply-subtract on double lanes: (x * y) - z.
+        template <class A>
+        inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+        {
+            const __m512d fused = _mm512_fmsub_pd(x, y, z);
+            return fused;
+        }
+
+        // from bool
+        // Turns a boolean batch into a numeric one: lanes selected by `self` become 1, the others 0.
+        template <class A, class T>
+        inline batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const batch<T, A> when_true(1);
+            const batch<T, A> when_false(0);
+            return select(self, when_true, when_false);
+        }
+
+        // from_mask
+        // Builds a boolean batch from an integer bit mask by narrowing it to the
+        // batch's mask register type. The first argument only selects the overload.
+        template <class T, class A>
+        inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx512f>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            return static_cast<mask_type>(mask);
+        }
+
+        // gather
+        // Gathers 32-bit integers: lane i is loaded from src[index[i]].
+        // The byte scale passed to the intrinsic is sizeof(T).
+        template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
+        inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+                                  kernel::requires_arch<avx512f>) noexcept
+        {
+            const __m512i gathered = _mm512_i32gather_epi32(index, static_cast<const void*>(src), sizeof(T));
+            return gathered;
+        }
+
+        // Gathers 64-bit integers: lane i is loaded from src[index[i]].
+        // The byte scale passed to the intrinsic is sizeof(T).
+        template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
+        inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+                                  kernel::requires_arch<avx512f>) noexcept
+        {
+            const __m512i gathered = _mm512_i64gather_epi64(index, static_cast<const void*>(src), sizeof(T));
+            return gathered;
+        }
+
+        // Gathers floats through 32-bit indices: lane i is loaded from src[index[i]].
+        template <class A, class U, detail::enable_sized_integral_t<U, 4> = 0>
+        inline batch<float, A> gather(batch<float, A> const&, float const* src,
+                                      batch<U, A> const& index,
+                                      kernel::requires_arch<avx512f>) noexcept
+        {
+            const __m512 gathered = _mm512_i32gather_ps(index, src, sizeof(float));
+            return gathered;
+        }
+
+        // Gathers doubles through 64-bit indices: lane i is loaded from src[index[i]].
+        template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
+        inline batch<double, A>
+        gather(batch<double, A> const&, double const* src, batch<U, A> const& index,
+               kernel::requires_arch<avx512f>) noexcept
+        {
+            const __m512d gathered = _mm512_i64gather_pd(index, src, sizeof(double));
+            return gathered;
+        }
+
+        // gather: handmade conversions
+        // Gathers sixteen doubles through 32-bit indices and narrows them to float.
+        // The index register is split into its two 256-bit halves, each half drives
+        // one 8-wide double gather, and the two converted halves are merged.
+        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
+        inline batch<float, A> gather(batch<float, A> const&, double const* src,
+                                      batch<V, A> const& index,
+                                      requires_arch<avx512f>) noexcept
+        {
+            const __m256i idx_lo = _mm512_castsi512_si256(index.data);
+            const __m256i idx_hi = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1));
+            const batch<double, A> low(_mm512_i32gather_pd(idx_lo, src, sizeof(double)));
+            const batch<double, A> high(_mm512_i32gather_pd(idx_hi, src, sizeof(double)));
+            return detail::merge_avx(_mm512_cvtpd_ps(low.data), _mm512_cvtpd_ps(high.data));
+        }
+
+        // Gathers sixteen doubles through 32-bit indices and converts them to int32.
+        // The index register is split into its two 256-bit halves, each half drives
+        // one 8-wide double gather, and the two converted halves are merged.
+        // NOTE(review): _mm512_cvtpd_epi32 rounds with the current rounding mode rather
+        // than truncating like a scalar static_cast would - confirm this is intended.
+        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
+        inline batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
+                                        batch<V, A> const& index,
+                                        requires_arch<avx512f>) noexcept
+        {
+            const __m256i idx_lo = _mm512_castsi512_si256(index.data);
+            const __m256i idx_hi = _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1));
+            const batch<double, A> low(_mm512_i32gather_pd(idx_lo, src, sizeof(double)));
+            const batch<double, A> high(_mm512_i32gather_pd(idx_hi, src, sizeof(double)));
+            return detail::merge_avx(_mm512_cvtpd_epi32(low.data), _mm512_cvtpd_epi32(high.data));
+        }
+
+        // ge
+        // Lane-wise self >= other for float batches (ordered, quiet: NaN lanes compare false).
+        template <class A>
+        inline batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const auto hits = _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ);
+            return hits;
+        }
+        // Lane-wise self >= other for double batches (ordered, quiet: NaN lanes compare false).
+        template <class A>
+        inline batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const auto hits = _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ);
+            return hits;
+        }
+        // Lane-wise self >= other for integral batches, delegated to the shared
+        // integer comparison helper with the GE predicate.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            constexpr int predicate = _MM_CMPINT_GE;
+            return detail::compare_int_avx512f<A, T, predicate>(self, other);
+        }
+
+        // gt
+        // Lane-wise self > other for float batches (ordered, quiet: NaN lanes compare false).
+        template <class A>
+        inline batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const auto hits = _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ);
+            return hits;
+        }
+        // Lane-wise self > other for double batches (ordered, quiet: NaN lanes compare false).
+        template <class A>
+        inline batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const auto hits = _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ);
+            return hits;
+        }
+        // Lane-wise self > other for integral batches, delegated to the shared
+        // integer comparison helper with the GT predicate.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            constexpr int predicate = _MM_CMPINT_GT;
+            return detail::compare_int_avx512f<A, T, predicate>(self, other);
+        }
+
+        // haddp
+        // Parallel horizontal add over sixteen float batches.
+        // Reads row[0] through row[15] and reduces them with a tree of 128-bit-lane
+        // shuffles and vertical adds, finishing with two 256-bit hadd_ps steps whose
+        // results are concatenated into one 512-bit register.
+        // NOTE(review): presumably lane i of the result is the sum of the elements of
+        // row[i] - confirm against the generic haddp contract.
+        template <class A>
+        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512f>) noexcept
+        {
+            // The following folds over the vector once:
+            // tmp1 = [a0..8, b0..8]
+            // tmp2 = [a8..f, b8..f]
+#define XSIMD_AVX512_HADDP_STEP1(I, a, b)                                \
+    batch<float, avx512f> res##I;                                        \
+    {                                                                    \
+        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
+        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
+        res##I = _mm512_add_ps(tmp1, tmp2);                              \
+    }
+
+            // Pair up the rows; each resN holds the lower-half + upper-half sums of two rows.
+            XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
+            XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
+            XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
+            XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
+            XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
+            XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
+            XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
+            XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);
+
+#undef XSIMD_AVX512_HADDP_STEP1
+
+            // The following folds the code and shuffles so that hadd_ps produces the correct result
+            // tmp1 = [a0..4,  a8..12,  b0..4,  b8..12] (same for tmp3)
+            // tmp2 = [a5..8, a12..16, b5..8, b12..16]  (same for tmp4)
+            // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
+#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d)                                                                                                         \
+    batch<float, avx2> halfx##I;                                                                                                                        \
+    {                                                                                                                                                   \
+        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0));                                                                                \
+        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1));                                                                                \
+                                                                                                                                                        \
+        auto resx1 = _mm512_add_ps(tmp1, tmp2);                                                                                                         \
+                                                                                                                                                        \
+        auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0));                                                                                \
+        auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1));                                                                                \
+                                                                                                                                                        \
+        auto resx2 = _mm512_add_ps(tmp3, tmp4);                                                                                                         \
+                                                                                                                                                        \
+        auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0));                                                                           \
+        auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1));                                                                           \
+                                                                                                                                                        \
+        auto resx3 = _mm512_add_ps(tmp5, tmp6);                                                                                                         \
+                                                                                                                                                        \
+        halfx##I = _mm256_hadd_ps(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 0)), _mm512_extractf32x4_ps(resx3, 1), 1),  \
+                                  _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 2)), _mm512_extractf32x4_ps(resx3, 3), 1)); \
+    }
+
+            // halfx0 is built from rows 0..7 (res0..res3), halfx1 from rows 8..15 (res4..res7).
+            XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
+            XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);
+
+#undef XSIMD_AVX512_HADDP_STEP2
+
+            // Place halfx0 in the low 256 bits and halfx1 in the high 256 bits of the result.
+            auto concat = _mm512_castps256_ps512(halfx0);
+            concat = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(concat), _mm256_castps_pd(halfx1), 1));
+            return concat;
+        }
+
+        // Parallel horizontal add over eight double batches: reads row[0] through
+        // row[7] and returns a batch whose lane i is the sum of the elements of row[i].
+        template <class A>
+        inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx512f>) noexcept
+        {
+            // Fold each pair of rows once: lower 256 bits + upper 256 bits of (a, b).
+            // The macro carries the XSIMD_ prefix (like the float overload) so it cannot
+            // clash with, or #undef, a user macro named `step1`.
+#define XSIMD_AVX512_HADDP_STEP1(I, a, b)                                \
+    batch<double, avx512f> res##I;                                       \
+    {                                                                    \
+        auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
+        auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
+        res##I = _mm512_add_pd(tmp1, tmp2);                              \
+    }
+
+            XSIMD_AVX512_HADDP_STEP1(1, row[0], row[2]);
+            XSIMD_AVX512_HADDP_STEP1(2, row[4], row[6]);
+            XSIMD_AVX512_HADDP_STEP1(3, row[1], row[3]);
+            XSIMD_AVX512_HADDP_STEP1(4, row[5], row[7]);
+
+#undef XSIMD_AVX512_HADDP_STEP1
+
+            // Second fold: each 128-bit lane of resx1 holds the two partial sums of an
+            // even row (0, 2, 4, 6).
+            auto tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0));
+            auto tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1));
+
+            auto resx1 = _mm512_add_pd(tmp5, tmp6);
+
+            // Same for the odd rows (1, 3, 5, 7).
+            auto tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0));
+            auto tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1));
+
+            auto resx2 = _mm512_add_pd(tmp7, tmp8);
+
+            // Final fold: interleave even/odd rows and add the two partial sums of each row.
+            auto tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000);
+            auto tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111);
+
+            return _mm512_add_pd(tmpx, tmpy);
+        }
+
+        // isnan
+        // Flags float lanes holding NaN: a value is unordered with itself only when it is NaN.
+        template <class A>
+        inline batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const auto nan_lanes = _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q);
+            return nan_lanes;
+        }
+        // Flags double lanes holding NaN: a value is unordered with itself only when it is NaN.
+        template <class A>
+        inline batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const auto nan_lanes = _mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q);
+            return nan_lanes;
+        }
+
+        // ldexp
+        // Computes self * 2^other per float lane: the integer exponents are converted
+        // to float and handed to the scalef instruction.
+        template <class A>
+        inline batch<float, A> ldexp(const batch<float, A>& self, const batch<as_integer_t<float>, A>& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512 exponent = _mm512_cvtepi32_ps(other);
+            return _mm512_scalef_ps(self, exponent);
+        }
+
+        // Computes self * 2^other per double lane.
+        // AVX-512F has no direct int64 -> double conversion, so the 64-bit exponents are
+        // first narrowed to int32. The narrowing saturates instead of truncating: an
+        // exponent outside the int32 range is clamped to INT32_MIN/INT32_MAX, which is
+        // still far beyond the exponent range of a double, so scalef overflows or
+        // underflows exactly as it would with the original exponent. A truncating
+        // narrowing would wrap such exponents to an unrelated small value.
+        template <class A>
+        inline batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512f>) noexcept
+        {
+            const __m256i exponent32 = _mm512_cvtsepi64_epi32(other);
+            const __m512d adjusted_index = _mm512_cvtepi32_pd(exponent32);
+            return _mm512_scalef_pd(self, adjusted_index);
+        }
+
+        // le
+        // Lane-wise self <= other for float batches (ordered, quiet: NaN lanes compare false).
+        template <class A>
+        inline batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const auto hits = _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ);
+            return hits;
+        }
+        // Lane-wise self <= other for double batches (ordered, quiet: NaN lanes compare false).
+        template <class A>
+        inline batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const auto hits = _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ);
+            return hits;
+        }
+        // Lane-wise self <= other for integral batches, delegated to the shared
+        // integer comparison helper with the LE predicate.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            constexpr int predicate = _MM_CMPINT_LE;
+            return detail::compare_int_avx512f<A, T, predicate>(self, other);
+        }
+
+        // load_aligned
+        // Loads 512 bits of integral data from `mem` using the aligned load intrinsic;
+        // the pointer is reinterpreted as a pointer to a 512-bit integer register.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
+        {
+            const auto* source = reinterpret_cast<__m512i const*>(mem);
+            return _mm512_load_si512(source);
+        }
+        // Loads sixteen floats from `mem` using the aligned load intrinsic.
+        template <class A>
+        inline batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
+        {
+            const __m512 loaded = _mm512_load_ps(mem);
+            return loaded;
+        }
+        // Loads eight doubles from `mem` using the aligned load intrinsic.
+        template <class A>
+        inline batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
+        {
+            const __m512d loaded = _mm512_load_pd(mem);
+            return loaded;
+        }
+
+        // load_complex
+        // De-interleaves two registers of (re, im) pairs into a complex batch.
+        // Viewing `hi` followed by `lo` as one sequence, the even positions become the
+        // real parts and the odd positions the imaginary parts.
+        namespace detail
+        {
+            template <class A>
+            inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx512f>) noexcept
+            {
+                const __m512i even_slots = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+                const __m512i odd_slots = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+                auto re_part = _mm512_permutex2var_ps(hi, even_slots, lo);
+                auto im_part = _mm512_permutex2var_ps(hi, odd_slots, lo);
+                return { re_part, im_part };
+            }
+            template <class A>
+            inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx512f>) noexcept
+            {
+                const __m512i even_slots = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14);
+                const __m512i odd_slots = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15);
+                auto re_part = _mm512_permutex2var_pd(hi, even_slots, lo);
+                auto im_part = _mm512_permutex2var_pd(hi, odd_slots, lo);
+                return { re_part, im_part };
+            }
+        }
+
+        // load_unaligned
+        // Loads 512 bits of integral data from `mem` using the unaligned load intrinsic;
+        // the pointer is reinterpreted as a pointer to a 512-bit integer register.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
+        {
+            const auto* source = reinterpret_cast<__m512i const*>(mem);
+            return _mm512_loadu_si512(source);
+        }
+        // Loads sixteen floats from `mem` using the unaligned load intrinsic.
+        template <class A>
+        inline batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
+        {
+            const __m512 loaded = _mm512_loadu_ps(mem);
+            return loaded;
+        }
+        // Loads eight doubles from `mem` using the unaligned load intrinsic.
+        template <class A>
+        inline batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
+        {
+            const __m512d loaded = _mm512_loadu_pd(mem);
+            return loaded;
+        }
+
+        // lt
+        // Lane-wise self < other for float batches (ordered, quiet: NaN lanes compare false).
+        template <class A>
+        inline batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const auto hits = _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ);
+            return hits;
+        }
+        // Lane-wise self < other for double batches (ordered, quiet: NaN lanes compare false).
+        template <class A>
+        inline batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const auto hits = _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ);
+            return hits;
+        }
+
+        // Lane-wise self < other for integral batches, delegated to the shared
+        // integer comparison helper with the LT predicate.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            constexpr int predicate = _MM_CMPINT_LT;
+            return detail::compare_int_avx512f<A, T, predicate>(self, other);
+        }
+
+        // mask
+        // Returns the boolean batch as an integer bit mask; the underlying mask
+        // register value is widened to 64 bits.
+        template <class A, class T>
+        inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return static_cast<uint64_t>(self.data);
+        }
+
+        // max
+        // Lane-wise maximum of two float batches.
+        template <class A>
+        inline batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512 larger = _mm512_max_ps(self, other);
+            return larger;
+        }
+        // Lane-wise maximum of two double batches.
+        template <class A>
+        inline batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512d larger = _mm512_max_pd(self, other);
+            return larger;
+        }
+        // Lane-wise maximum of two integral batches.
+        // 32- and 64-bit lanes use the native AVX-512F instruction matching the
+        // signedness of T; narrower lanes are processed as two 256-bit halves through
+        // the AVX2 implementation.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            if (!std::is_signed<T>::value)
+            {
+                // Unsigned lanes.
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm512_max_epu32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm512_max_epu64(self, other);
+                }
+                else
+                {
+                    const auto half_max = [](__m256i s, __m256i o) noexcept
+                    { return max(batch<T, avx2>(s), batch<T, avx2>(o)); };
+                    return detail::fwd_to_avx(half_max, self, other);
+                }
+            }
+            else
+            {
+                // Signed lanes.
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm512_max_epi32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm512_max_epi64(self, other);
+                }
+                else
+                {
+                    const auto half_max = [](__m256i s, __m256i o) noexcept
+                    { return max(batch<T, avx2>(s), batch<T, avx2>(o)); };
+                    return detail::fwd_to_avx(half_max, self, other);
+                }
+            }
+        }
+
+        // min
+        // Lane-wise minimum of two float batches.
+        template <class A>
+        inline batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512 smaller = _mm512_min_ps(self, other);
+            return smaller;
+        }
+        // Lane-wise minimum of two double batches.
+        template <class A>
+        inline batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512d smaller = _mm512_min_pd(self, other);
+            return smaller;
+        }
+        // Lane-wise minimum of two integral batches.
+        // 32- and 64-bit lanes use the native AVX-512F instruction matching the
+        // signedness of T; narrower lanes are processed as two 256-bit halves through
+        // the AVX2 implementation.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            if (!std::is_signed<T>::value)
+            {
+                // Unsigned lanes.
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm512_min_epu32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm512_min_epu64(self, other);
+                }
+                else
+                {
+                    const auto half_min = [](__m256i s, __m256i o) noexcept
+                    { return min(batch<T, avx2>(s), batch<T, avx2>(o)); };
+                    return detail::fwd_to_avx(half_min, self, other);
+                }
+            }
+            else
+            {
+                // Signed lanes.
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm512_min_epi32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm512_min_epi64(self, other);
+                }
+                else
+                {
+                    const auto half_min = [](__m256i s, __m256i o) noexcept
+                    { return min(batch<T, avx2>(s), batch<T, avx2>(o)); };
+                    return detail::fwd_to_avx(half_min, self, other);
+                }
+            }
+        }
+
+        // mul
+        // Lane-wise product of two float batches.
+        template <class A>
+        inline batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512 product = _mm512_mul_ps(self, other);
+            return product;
+        }
+        // Lane-wise product of two double batches.
+        template <class A>
+        inline batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const __m512d product = _mm512_mul_pd(self, other);
+            return product;
+        }
+        // Lane-wise product of two integral batches (low bits of each product are kept).
+        // 32-bit lanes use the native mullo instruction; every other width is processed
+        // as two 256-bit halves through the AVX2 implementation.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm512_mullo_epi32(self, other);
+            }
+            else
+            {
+                const auto half_mul = [](__m256i s, __m256i o) noexcept
+                { return mul(batch<T, avx2>(s), batch<T, avx2>(o)); };
+                return detail::fwd_to_avx(half_mul, self, other);
+            }
+        }
+
+        // nearbyint
+        // Rounds each float lane to an integral value with the roundscale instruction,
+        // requesting _MM_FROUND_TO_NEAREST_INT and _MM_FROUND_CUR_DIRECTION.
+        template <class A>
+        inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const __m512 rounded = _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
+            return rounded;
+        }
+        // Rounds each double lane to an integral value with the roundscale instruction,
+        // requesting _MM_FROUND_TO_NEAREST_INT and _MM_FROUND_CUR_DIRECTION.
+        template <class A>
+        inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const __m512d rounded = _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
+            return rounded;
+        }
+
+        // nearbyint_as_int
+        // Converts each float lane to int32 using the non-truncating conversion
+        // instruction (rounding follows the current rounding mode).
+        template <class A>
+        inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+                                                  requires_arch<avx512f>) noexcept
+        {
+            const __m512i as_int = _mm512_cvtps_epi32(self);
+            return as_int;
+        }
+
+        // neg
+        // Generic negation, used for integral lanes: computed as 0 - self.
+        template <class A, class T>
+        inline batch<T, A> neg(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return 0 - self;
+        }
+
+        // Float negation: flips the sign bit of every lane.
+        // Unlike 0 - self, this maps +0.0f to -0.0f, as IEEE-754 negation requires.
+        // The XOR is done in the integer domain because the floating-point XOR
+        // intrinsic is not part of AVX-512F.
+        template <class A>
+        inline batch<float, A> neg(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const __m512i sign_bits = _mm512_castps_si512(_mm512_set1_ps(-0.0f));
+            return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), sign_bits));
+        }
+
+        // Double negation: flips the sign bit of every lane (see the float overload).
+        template <class A>
+        inline batch<double, A> neg(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            const __m512i sign_bits = _mm512_castpd_si512(_mm512_set1_pd(-0.0));
+            return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), sign_bits));
+        }
+
+        // neq
+        // Lane-wise inequality of float batches using the unordered, quiet predicate
+        // (a lane holding NaN on either side compares true).
+        template <class A>
+        inline batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const auto differing = _mm512_cmp_ps_mask(self, other, _CMP_NEQ_UQ);
+            return differing;
+        }
+        // Lane-wise inequality of double batches using the unordered, quiet predicate
+        // (a lane holding NaN on either side compares true).
+        template <class A>
+        inline batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const auto differing = _mm512_cmp_pd_mask(self, other, _CMP_NEQ_UQ);
+            return differing;
+        }
+        // Lane-wise inequality of integral batches: the complement of the equality mask.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            const auto equal_lanes = (self == other);
+            return ~equal_lanes;
+        }
+
+        // Lane-wise inequality of two boolean batches: a result bit is set where the
+        // two mask bits differ (XOR).
+        template <class A, class T>
+        inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            const auto differing = self.data ^ other.data;
+            return mask_type(differing);
+        }
+
+        // reciprocal: per-lane approximate 1/x using the rcp14 intrinsics
+        template <class A>
+        inline batch<float, A>
+        reciprocal(batch<float, A> const& self,
+                   kernel::requires_arch<avx512f>) noexcept
+        {
+            return _mm512_rcp14_ps(self); // single-precision variant
+        }
+
+        template <class A>
+        inline batch<double, A>
+        reciprocal(batch<double, A> const& self,
+                   kernel::requires_arch<avx512f>) noexcept
+        {
+            return _mm512_rcp14_pd(self); // double-precision variant
+        }
+
+        // reduce_add: horizontal sum of every lane, returned as a scalar
+        template <class A>
+        inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
+        {
+            __m128 tmp1 = _mm512_extractf32x4_ps(rhs, 0); // the four 128-bit quarters of the register
+            __m128 tmp2 = _mm512_extractf32x4_ps(rhs, 1);
+            __m128 tmp3 = _mm512_extractf32x4_ps(rhs, 2);
+            __m128 tmp4 = _mm512_extractf32x4_ps(rhs, 3);
+            __m128 res1 = _mm_add_ps(tmp1, tmp2); // pairwise adds fold 16 floats down to 4
+            __m128 res2 = _mm_add_ps(tmp3, tmp4);
+            __m128 res3 = _mm_add_ps(res1, res2);
+            return reduce_add(batch<float, sse4_2>(res3), sse4_2 {}); // last 4 lanes are summed by the sse4_2 kernel
+        }
+        template <class A>
+        inline double reduce_add(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
+        {
+            __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1); // upper 256 bits
+            __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0); // lower 256 bits
+            __m256d res1 = _mm256_add_pd(tmp1, tmp2); // fold 8 doubles down to 4
+            return reduce_add(batch<double, avx2>(res1), avx2 {}); // remaining lanes are summed by the avx2 kernel
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline T reduce_add(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            __m256i low, high; // filled by detail::split_avx512 on the next line
+            detail::split_avx512(self, low, high);
+            batch<T, avx2> blow(low), bhigh(high);
+            return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {}); // sum of the two avx2 reductions
+        }
+
+        // reduce_max: horizontal maximum of every lane (this overload is enabled for 1-byte T only)
+        template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
+        inline T reduce_max(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            constexpr batch_constant<batch<uint64_t, A>, 4, 5, 6, 7, 0, 0, 0, 0> mask; // qwords 4..7 -> 0..3; indices are 3-bit, so 8 would alias qword 0
+            batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self);
+            batch<T, A> acc = max(self, step); // low 256 bits now hold max(low half, high half)
+            __m256i low = _mm512_castsi512_si256(acc);
+            return reduce_max(batch<T, avx2>(low)); // finish the reduction on the 256-bit half
+        }
+
+        // reduce_min: horizontal minimum of every lane (this overload is enabled for 1-byte T only)
+        template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
+        inline T reduce_min(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            constexpr batch_constant<batch<uint64_t, A>, 4, 5, 6, 7, 0, 0, 0, 0> mask; // qwords 4..7 -> 0..3; indices are 3-bit, so 8 would alias qword 0
+            batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self);
+            batch<T, A> acc = min(self, step); // low 256 bits now hold min(low half, high half)
+            __m256i low = _mm512_castsi512_si256(acc);
+            return reduce_min(batch<T, avx2>(low)); // finish the reduction on the 256-bit half
+        }
+
+        // rsqrt: per-lane approximate reciprocal square root using the rsqrt14 intrinsics
+        template <class A>
+        inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_rsqrt14_ps(val); // single-precision variant
+        }
+        template <class A>
+        inline batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_rsqrt14_pd(val); // double-precision variant
+        }
+
+        // sadd: saturating addition for integral lanes (result is clamped instead of wrapping)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                auto mask = other < 0; // lanes whose addend is negative
+                auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self); // cap self so self + other <= max
+                auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self); // raise self so self + other >= min
+                return other + select(mask, self_neg_branch, self_pos_branch); // pick the clamp matching the addend's sign
+            }
+            else
+            {
+                const auto diffmax = std::numeric_limits<T>::max() - self; // headroom left before wrapping
+                const auto mindiff = min(diffmax, other); // never add more than the headroom
+                return self + mindiff;
+            }
+        }
+
+        // scatter: write the lanes of src to dst at the positions given by index, using the scatter intrinsics
+        template <class A, class T,
+                  class = typename std::enable_if<std::is_same<uint32_t, T>::value || std::is_same<int32_t, T>::value, void>::type>
+        inline void scatter(batch<T, A> const& src, T* dst,
+                            batch<int32_t, A> const& index,
+                            kernel::requires_arch<avx512f>) noexcept
+        {
+            _mm512_i32scatter_epi32(dst, index, src, sizeof(T)); // scale = element size, so index counts elements
+        }
+
+        template <class A, class T,
+                  class = typename std::enable_if<std::is_same<uint64_t, T>::value || std::is_same<int64_t, T>::value, void>::type>
+        inline void scatter(batch<T, A> const& src, T* dst,
+                            batch<int64_t, A> const& index,
+                            kernel::requires_arch<avx512f>) noexcept
+        {
+            _mm512_i64scatter_epi64(dst, index, src, sizeof(T)); // 64-bit indices, 64-bit elements
+        }
+
+        template <class A>
+        inline void scatter(batch<float, A> const& src, float* dst,
+                            batch<int32_t, A> const& index,
+                            kernel::requires_arch<avx512f>) noexcept
+        {
+            _mm512_i32scatter_ps(dst, index, src, sizeof(float)); // 32-bit indices, float elements
+        }
+
+        template <class A>
+        inline void scatter(batch<double, A> const& src, double* dst,
+                            batch<int64_t, A> const& index,
+                            kernel::requires_arch<avx512f>) noexcept
+        {
+            _mm512_i64scatter_pd(dst, index, src, sizeof(double)); // 64-bit indices, double elements
+        }
+
+        // select: per-lane choice between true_br and false_br, driven by the mask in cond
+        template <class A>
+        inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_mask_blend_ps(cond, false_br, true_br); // blend takes the second source where the mask bit is set
+        }
+        template <class A>
+        inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_mask_blend_pd(cond, false_br, true_br); // same operand order as the float overload
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1) // 8-bit lanes: go through two avx2 selects
+            {
+                alignas(avx2::alignment()) uint8_t buffer[64]; // one byte per lane, 0xFF where the mask bit is set
+                // FIXME: ultra inefficient
+                for (int i = 0; i < 64; ++i)
+                    buffer[i] = cond.data & (1ull << i) ? 0xFF : 0;
+                __m256i cond_low = batch<uint8_t, avx2>::load_aligned(&buffer[0]); // byte mask for lanes 0..31
+                __m256i cond_hi = batch<uint8_t, avx2>::load_aligned(&buffer[32]); // byte mask for lanes 32..63
+
+                __m256i true_low, true_hi;
+                detail::split_avx512(true_br, true_low, true_hi);
+
+                __m256i false_low, false_hi;
+                detail::split_avx512(false_br, false_low, false_hi);
+
+                __m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
+                __m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
+                return detail::merge_avx(res_low, res_hi); // recombine the two 256-bit results
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) // 16-bit lanes: same split, mask expanded with a masked narrowing convert
+            {
+                __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0)); // mask bits 0..15 -> all-ones / zero 16-bit lanes
+                __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0)); // mask bits 16..31
+
+                __m256i true_low, true_hi;
+                detail::split_avx512(true_br, true_low, true_hi);
+
+                __m256i false_low, false_hi;
+                detail::split_avx512(false_br, false_low, false_hi);
+
+                __m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
+                __m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
+                return detail::merge_avx(res_low, res_hi); // recombine the two 256-bit results
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) // 32-bit lanes: direct mask blend
+            {
+                return _mm512_mask_blend_epi32(cond, false_br, true_br);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) // 64-bit lanes: direct mask blend
+            {
+                return _mm512_mask_blend_epi64(cond, false_br, true_br);
+            }
+            else
+            {
+                assert(false && "unsupported arch/type combination");
+                return {};
+            }
+        }
+
+        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
+        {
+            return select(batch_bool<T, A> { Values... }, true_br, false_br, avx512f {}); // materialize the compile-time booleans, then use the batch_bool select
+        }
+
+        namespace detail // SFINAE helpers: each alias names int only when T satisfies the stated condition
+        {
+            template <class T>
+            using enable_signed_integer_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, // T is a signed integer type
+                                                                    int>::type;
+
+            template <class T>
+            using enable_unsigned_integer_t = typename std::enable_if<std::is_integral<T>::value && std::is_unsigned<T>::value, // T is an unsigned integer type
+                                                                      int>::type;
+        }
+
+        // set: build a batch from one scalar per lane, with v0 placed in the lowest lane
+        template <class A>
+        inline batch<float, A> set(batch<float, A> const&, requires_arch<avx512f>, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) noexcept
+        {
+            return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); // setr: arguments listed lowest lane first
+        }
+
+        template <class A>
+        inline batch<double, A> set(batch<double, A> const&, requires_arch<avx512f>, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) noexcept
+        {
+            return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); // setr: arguments listed lowest lane first
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+        {
+            return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0); // set (not setr) lists the highest lane first, hence the reversed order
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+                               T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+        {
+            return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); // 16 arguments: 32-bit lanes
+        }
+        template <class A, class T, detail::enable_signed_integer_t<T> = 0>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+                               T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+                               T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+                               T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
+        { // 32 signed 16-bit lanes; v0 goes to the lowest lane on both branches
+#if defined(__clang__) || __GNUC__
+            return __extension__(__m512i)(__v32hi) {
+                v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+                v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+            };
+#else // _mm512_set_epi16 lists the highest lane first, so pass v31 down to v0
+            return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+                                    v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
+#endif
+        }
+
+        template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+                               T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+                               T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+                               T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
+        { // 32 unsigned 16-bit lanes; v0 goes to the lowest lane on both branches
+#if defined(__clang__) || __GNUC__
+            return __extension__(__m512i)(__v32hu) {
+                v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+                v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+            };
+#else // _mm512_set_epi16 lists the highest lane first, so pass v31 down to v0
+            return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+                                    v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
+#endif
+        }
+
+        template <class A, class T, detail::enable_signed_integer_t<T> = 0>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+                               T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+                               T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+                               T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
+                               T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
+                               T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
+                               T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
+                               T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
+        {
+            // 64 signed 8-bit lanes; v0 goes to the lowest lane on both branches
+#if defined(__clang__) || __GNUC__
+            return __extension__(__m512i)(__v64qi) {
+                v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+                v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
+                v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
+                v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
+            };
+#else // _mm512_set_epi8 lists the highest lane first, so pass v63 down to v0
+            return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
+                                   v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
+                                   v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+                                   v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
+#endif
+        }
+        template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+                               T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+                               T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+                               T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
+                               T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
+                               T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
+                               T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
+                               T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
+        {
+            // 64 unsigned 8-bit lanes; v0 goes to the lowest lane on both branches
+#if defined(__clang__) || __GNUC__
+            return __extension__(__m512i)(__v64qu) {
+                v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+                v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
+                v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
+                v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
+            };
+#else // _mm512_set_epi8 lists the highest lane first, so pass v63 down to v0
+            return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
+                                   v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
+                                   v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+                                   v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
+#endif
+        }
+
+        template <class A, class T, class... Values>
+        inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx512f>, Values... values) noexcept
+        {
+            static_assert(sizeof...(Values) == batch_bool<T, A>::size, "consistent init"); // exactly one value per lane
+            using register_type = typename batch_bool<T, A>::register_type;
+            register_type r = 0; // mask under construction
+            unsigned shift = 0; // bit position of the next value
+            (void)std::initializer_list<register_type> { (r |= register_type(values ? 1 : 0) << (shift++))... }; // braced list evaluates left to right: value i sets bit i
+            return r;
+        }
+
+        // slide_left: placeholder; instantiation trips the static_assert unless N == 0xDEAD
+        template <size_t N, class A, class T>
+        inline batch<T, A> slide_left(batch<T, A> const&, requires_arch<avx512f>) noexcept
+        {
+            static_assert(N == 0xDEAD, "not implemented yet");
+            return {};
+        }
+
+        // slide_right: placeholder; instantiation trips the static_assert unless N == 0xDEAD
+        template <size_t N, class A, class T>
+        inline batch<T, A> slide_right(batch<T, A> const&, requires_arch<avx512f>) noexcept
+        {
+            static_assert(N == 0xDEAD, "not implemented yet");
+            return {};
+        }
+
+        // sqrt: per-lane square root via the 512-bit sqrt intrinsics
+        template <class A>
+        inline batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_sqrt_ps(val); // single-precision variant
+        }
+        template <class A>
+        inline batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_sqrt_pd(val); // double-precision variant
+        }
+
+        // ssub: saturating subtraction for integral lanes
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                return sadd(self, -other); // NOTE(review): -other presumably wraps when a lane holds the minimum value; confirm that case
+            }
+            else
+            {
+                const auto diff = min(self, other); // never subtract more than self holds
+                return self - diff; // so the result bottoms out at 0
+            }
+        }
+
+        // store: expand the mask of a batch_bool into one bool per lane at mem[0..size)
+        template <class T, class A>
+        inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<avx512f>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            constexpr auto size = batch_bool<T, A>::size;
+            for (std::size_t i = 0; i < size; ++i)
+                mem[i] = self.data & (register_type(1) << i); // bit i of the mask, converted to bool
+        }
+
+        // store_aligned: write 512 bits to mem with the aligned store intrinsics
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_store_si512((__m512i*)mem, self); // integral lanes share the si512 store
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_store_si512((__m512i*)mem, self); // NOTE(review): passes a batch_bool to the si512 store; conversion is not visible here
+        }
+        template <class A>
+        inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_store_ps(mem, self); // float lanes
+        }
+        template <class A>
+        inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_store_pd(mem, self); // double lanes
+        }
+
+        // store_unaligned: write 512 bits to mem with the unaligned (storeu) intrinsics
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_storeu_si512((__m512i*)mem, self); // integral lanes share the si512 store
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_storeu_si512((__m512i*)mem, self); // NOTE(review): passes a batch_bool to the si512 store; conversion is not visible here
+        }
+        template <class A>
+        inline void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_storeu_ps(mem, self); // float lanes
+        }
+        template <class A>
+        inline void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_storeu_pd(mem, self); // double lanes
+        }
+
+        // sub: lane-wise subtraction self - other
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1) // 8-bit lanes: delegate to the avx2 kernel
+            {
+                return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept // presumably applied to each 256-bit half; see detail::fwd_to_avx
+                                          { return sub(batch<T, avx2>(s), batch<T, avx2>(o)); },
+                                          self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) // 16-bit lanes: same delegation
+            {
+                return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+                                          { return sub(batch<T, avx2>(s), batch<T, avx2>(o)); },
+                                          self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) // 32-bit lanes: native 512-bit subtract
+            {
+                return _mm512_sub_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) // 64-bit lanes: native 512-bit subtract
+            {
+                return _mm512_sub_epi64(self, other);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+        template <class A>
+        inline batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_sub_ps(self, other); // float lanes: self - other
+        }
+        template <class A>
+        inline batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_sub_pd(self, other); // double lanes: self - other
+        }
+
+        // swizzle: permute lanes with a compile-time index mask, using the permutexvar intrinsics
+        template <class A, uint32_t... Vs>
+        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_permutexvar_ps((batch<uint32_t, A>)mask, self); // mask is converted to a runtime index batch
+        }
+
+        template <class A, uint64_t... Vs>
+        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_permutexvar_pd((batch<uint64_t, A>)mask, self); // 64-bit indices for 64-bit lanes
+        }
+
+        template <class A, uint64_t... Vs>
+        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self); // integer 64-bit lanes
+        }
+
+        template <class A, uint64_t... Vs>
+        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+        {
+            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx512f {})); // reuse the uint64_t overload through bit casts
+        }
+
+        template <class A, uint32_t... Vs>
+        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_permutexvar_epi32((batch<uint32_t, A>)mask, self); // integer 32-bit lanes
+        }
+
+        template <class A, uint32_t... Vs>
+        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+        {
+            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx512f {})); // reuse the uint32_t overload through bit casts
+        }
+
+        namespace detail // compile-time helpers for the 16-bit swizzle overloads
+        {
+            template <class T, class A, T... Idx>
+            struct is_pair_of_contiguous_indices; // true when the indices come as (even, even + 1) pairs
+
+            template <class T, class A>
+            struct is_pair_of_contiguous_indices<T, A> : std::true_type // empty pack: nothing left to check
+            {
+            };
+
+            template <class T, class A, T Idx0, T Idx1, T... Idx>
+            struct is_pair_of_contiguous_indices<T, A, Idx0, Idx1, Idx...> : std::conditional<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices<T, A, Idx...>, std::false_type>::type // check one pair, recurse on the rest
+            {
+            };
+
+            template <class A, uint16_t I0, uint16_t I1, uint16_t I2, uint16_t I3, uint16_t I4, uint16_t I5, uint16_t I6, uint16_t I7,
+                      uint16_t I8, uint16_t I9, uint16_t I10, uint16_t I11, uint16_t I12, uint16_t I13, uint16_t I14, uint16_t I15,
+                      uint16_t I16, uint16_t I17, uint16_t I18, uint16_t I19, uint16_t I20, uint16_t I21, uint16_t I22, uint16_t I23,
+                      uint16_t I24, uint16_t I25, uint16_t I26, uint16_t I27, uint16_t I28, uint16_t I29, uint16_t I30, uint16_t I31>
+            struct fold_batch_constant // turns 32 16-bit indices into 16 32-bit indices
+            {
+                using type = batch_constant<batch<uint32_t, A>, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2, // keep even positions, halve each index
+                                            I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
+            };
+
+        }
+
+        template <class A, uint16_t... Idx, class _ = typename std::enable_if<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value, void>::type>
+        inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Idx...>, requires_arch<avx512f>) noexcept
+        {
+            constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32; // 16-bit pairs move together, so a 32-bit permutation suffices
+            return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
+        }
+
+        template <class A>
+        inline batch<uint16_t, A>
+        swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
+        {
+            // FIXME: this sequence is very inefficient, but it's here to catch
+            // a pattern generated by detail::reduce from xsimd_generic_math.hpp.
+            // The whole pattern is actually decently folded by GCC and Clang,
+            // so bear with it.
+            constexpr batch_constant<batch<uint32_t, A>, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32; // broadcast 32-bit lane 0
+            auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self); // 16-bit view is now [e0, e1, e0, e1, ...]
+
+            alignas(A::alignment()) uint16_t buffer[32];
+            _mm512_store_si512((__m512i*)&buffer[0], tmp);
+            buffer[0] = buffer[1]; // patch element 0 so the result starts with [e1, e1, e0, e1, ...]
+            return _mm512_load_si512(&buffer[0]);
+        }
+
+        template <class A, uint16_t... Vs>
+        inline batch<int16_t, A>
+        swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+        {
+            return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512f {})); // reuse the uint16_t overloads through bit casts
+        }
+
+        // trunc: round each lane toward zero (roundscale with the TO_ZERO immediate)
+        template <class A>
+        inline batch<float, A>
+        trunc(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); // float lanes
+        }
+        template <class A>
+        inline batch<double, A>
+        trunc(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); // double lanes
+        }
+
+        // zip_hi: interleave the upper halves of self and other (32- and 64-bit integral lanes only)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A>
+        zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            __m512i lo, hi; // per-128-bit-lane unpacklo / unpackhi results
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                assert(false && "not implemented yet"); // NOTE(review): with asserts compiled out this falls through with lo/hi uninitialized
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                assert(false && "not implemented yet"); // NOTE(review): same fall-through as the 8-bit branch
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                lo = _mm512_unpacklo_epi32(self, other);
+                hi = _mm512_unpackhi_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                lo = _mm512_unpacklo_epi64(self, other);
+                hi = _mm512_unpackhi_epi64(self, other);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+            return _mm512_inserti32x4( // final 128-bit lane order: [lo2, hi2, lo3, hi3]
+                _mm512_inserti32x4(
+                    _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0), // lane 0 <- lo2
+                    _mm512_extracti32x4_epi32(lo, 3), // lane 2 <- lo3
+                    2),
+                _mm512_extracti32x4_epi32(hi, 2), // lane 1 <- hi2 (lane 3 keeps hi3)
+                1);
+        }
+        template <class A>
+        inline batch<float, A>
+        zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            auto lo = _mm512_unpacklo_ps(self, other); // interleave within each 128-bit lane, low part
+            auto hi = _mm512_unpackhi_ps(self, other); // interleave within each 128-bit lane, high part
+            return _mm512_insertf32x4( // final 128-bit lane order: [lo2, hi2, lo3, hi3]
+                _mm512_insertf32x4(
+                    _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0), // lane 0 <- lo2
+                    _mm512_extractf32x4_ps(lo, 3), // lane 2 <- lo3
+                    2),
+                _mm512_extractf32x4_ps(hi, 2), // lane 1 <- hi2 (lane 3 keeps hi3)
+                1);
+        }
+        template <class A>
+        inline batch<double, A>
+        zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other)); // cast to ps so the f32x4 lane insert/extract can be used
+            auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other));
+            return _mm512_castps_pd(_mm512_insertf32x4( // final 128-bit lane order: [lo2, hi2, lo3, hi3]
+                _mm512_insertf32x4(
+                    _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0), // lane 0 <- lo2
+                    _mm512_extractf32x4_ps(lo, 3), // lane 2 <- lo3
+                    2),
+                _mm512_extractf32x4_ps(hi, 2), // lane 1 <- hi2 (lane 3 keeps hi3)
+                1));
+        }
+
+        // zip_lo: interleave the lower halves of self and other (32- and 64-bit integral lanes only)
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A>
+        zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            __m512i lo, hi; // per-128-bit-lane unpacklo / unpackhi results
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                assert(false && "not implemented yet"); // NOTE(review): with asserts compiled out this falls through with lo/hi uninitialized
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                assert(false && "not implemented yet"); // NOTE(review): same fall-through as the 8-bit branch
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                lo = _mm512_unpacklo_epi32(self, other);
+                hi = _mm512_unpackhi_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                lo = _mm512_unpacklo_epi64(self, other);
+                hi = _mm512_unpackhi_epi64(self, other);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+            return _mm512_inserti32x4( // final 128-bit lane order: [lo0, hi0, lo1, hi1]
+                _mm512_inserti32x4(
+                    _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1), // lane 1 <- hi0
+                    _mm512_extracti32x4_epi32(hi, 1), // lane 3 <- hi1
+                    3),
+                _mm512_extracti32x4_epi32(lo, 1), // lane 2 <- lo1 (lane 0 keeps lo0)
+                2);
+        }
+        template <class A>
+        inline batch<float, A>
+        zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            auto lo = _mm512_unpacklo_ps(self, other); // interleave within each 128-bit lane, low part
+            auto hi = _mm512_unpackhi_ps(self, other); // interleave within each 128-bit lane, high part
+            return _mm512_insertf32x4( // final 128-bit lane order: [lo0, hi0, lo1, hi1]
+                _mm512_insertf32x4(
+                    _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1), // lane 1 <- hi0
+                    _mm512_extractf32x4_ps(hi, 1), // lane 3 <- hi1
+                    3),
+                _mm512_extractf32x4_ps(lo, 1), // lane 2 <- lo1 (lane 0 keeps lo0)
+                2);
+        }
+        template <class A>
+        inline batch<double, A>
+        zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other)); // cast to ps so the f32x4 lane insert/extract can be used
+            auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other));
+            return _mm512_castps_pd(_mm512_insertf32x4( // final 128-bit lane order: [lo0, hi0, lo1, hi1]
+                _mm512_insertf32x4(
+                    _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1), // lane 1 <- hi0
+                    _mm512_extractf32x4_ps(hi, 1), // lane 3 <- hi1
+                    3),
+                _mm512_extractf32x4_ps(lo, 1), // lane 2 <- lo1 (lane 0 keeps lo0)
+                2));
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp
new file mode 100644
index 0000000000..1ae77e8c7d
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp
@@ -0,0 +1,384 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_NUMERICAL_CONSTANT_HPP
+#define XSIMD_NUMERICAL_CONSTANT_HPP
+
+#include <limits>
+
+#include "../types/xsimd_utils.hpp"
+
+namespace xsimd
+{
+
+    namespace constants
+    {
+        // Both macros define NAME<T>(): float/double are specialized (value literal, or raw bits via bit_cast for _HEX); any other T is built from NAME<T::value_type>().
+#define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \
+    template <class T>                              \
+    inline T NAME() noexcept                        \
+    {                                               \
+        return T(NAME<typename T::value_type>());   \
+    }                                               \
+    template <>                                     \
+    inline float NAME<float>() noexcept             \
+    {                                               \
+        return SINGLE;                              \
+    }                                               \
+    template <>                                     \
+    inline double NAME<double>() noexcept           \
+    {                                               \
+        return DOUBLE;                              \
+    }
+
+#define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \
+    template <class T>                                  \
+    inline T NAME() noexcept                            \
+    {                                                   \
+        return T(NAME<typename T::value_type>());       \
+    }                                                   \
+    template <>                                         \
+    inline float NAME<float>() noexcept                 \
+    {                                                   \
+        return bit_cast<float>((uint32_t)SINGLE);       \
+    }                                                   \
+    template <>                                         \
+    inline double NAME<double>() noexcept               \
+    {                                                   \
+        return bit_cast<double>((uint64_t)DOUBLE);      \
+    }
+        // Constant table: (name, float value, double value); _HEX entries give the raw 32-bit / 64-bit patterns instead of literals.
+        XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits<float>::infinity()), (std::numeric_limits<double>::infinity()))
+        XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986)
+        XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000)
+        XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200)
+        XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949)
+        XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883)
+        XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553)
+        XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000)
+        XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76)
+        XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000)
+        XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312)
+        XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12)
+        XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd)
+        XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5)
+        XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0) // 2^24 and 2^53
+        XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400)
+        XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.)
+        XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167)
+        XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18)
+        XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641)
+        XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.)
+        XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167)
+        XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity<float>()), (-infinity<double>()))
+        XSIMD_DEFINE_CONSTANT(minuszero, -0.0f, -0.0)
+        XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff) // every bit set
+        XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000)
+        XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000)
+        XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18)
+        XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07)
+        XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07)
+        XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18)
+        XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000)
+        XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331)
+        XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000)
+        XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073)
+        XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000)
+        XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1)
+        XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18)
+        XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000) // only the top (sign) bit set
+        XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits<float>::min(), std::numeric_limits<double>::min())
+        XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704)
+        XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000)
+        XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31)
+        XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6)
+        XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e)
+        XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883)
+        XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0) // 2^23 and 2^52
+        XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286)
+
+#undef XSIMD_DEFINE_CONSTANT
+#undef XSIMD_DEFINE_CONSTANT_HEX
+        // Forward declarations of the constexpr helpers; each one is defined further down in this namespace.
+        template <class T>
+        constexpr T allbits() noexcept;
+
+        template <class T>
+        constexpr as_integer_t<T> mask1frexp() noexcept;
+
+        template <class T>
+        constexpr as_integer_t<T> mask2frexp() noexcept;
+
+        template <class T>
+        constexpr as_integer_t<T> maxexponent() noexcept;
+
+        template <class T>
+        constexpr as_integer_t<T> maxexponentm1() noexcept;
+
+        template <class T>
+        constexpr int32_t nmb() noexcept;
+
+        template <class T>
+        constexpr T zero() noexcept;
+
+        template <class T>
+        constexpr T minvalue() noexcept;
+
+        template <class T>
+        constexpr T maxvalue() noexcept;
+
+        /**************************
+         * allbits implementation *
+         **************************/
+
+        namespace detail
+        {
+            template <class T, bool = std::is_integral<T>::value> // primary template: selected for integral T
+            struct allbits_impl
+            {
+                static constexpr T get_value() noexcept
+                {
+                    return T(~0); // ~0 converted to T: every bit set
+                }
+            };
+
+            template <class T>
+            struct allbits_impl<T, false> // non-integral T
+            {
+                static constexpr T get_value() noexcept
+                {
+                    return nan<T>(); // nan<float>/nan<double> are defined above from all-ones bit patterns
+                }
+            };
+        }
+
+        template <class T>
+        inline constexpr T allbits() noexcept
+        {
+            return T(detail::allbits_impl<typename T::value_type>::get_value()); // build T from the scalar all-ones value
+        }
+
+        /*****************************
+         * mask1frexp implementation *
+         *****************************/
+
+        template <class T> // generic T exposing value_type: build the integer counterpart from the scalar mask
+        inline constexpr as_integer_t<T> mask1frexp() noexcept
+        {
+            return as_integer_t<T>(mask1frexp<typename T::value_type>());
+        }
+
+        template <>
+        inline constexpr int32_t mask1frexp<float>() noexcept
+        {
+            return 0x7f800000; // bits 30..23 set: the exponent field of an IEEE-754 single
+        }
+
+        template <>
+        inline constexpr int64_t mask1frexp<double>() noexcept
+        {
+            return 0x7ff0000000000000; // bits 62..52 set: the exponent field of an IEEE-754 double
+        }
+
+        /*****************************
+         * mask2frexp implementation *
+         *****************************/
+
+        template <class T> // generic T exposing value_type: build the integer counterpart from the scalar mask
+        inline constexpr as_integer_t<T> mask2frexp() noexcept
+        {
+            return as_integer_t<T>(mask2frexp<typename T::value_type>());
+        }
+
+        template <>
+        inline constexpr int32_t mask2frexp<float>() noexcept
+        {
+            return 0x3f000000; // same bits as the IEEE-754 single 0.5f
+        }
+
+        template <>
+        inline constexpr int64_t mask2frexp<double>() noexcept
+        {
+            return 0x3fe0000000000000; // same bits as the IEEE-754 double 0.5
+        }
+
+        /******************************
+         * maxexponent implementation *
+         ******************************/
+
+        template <class T> // generic T exposing value_type: build the integer counterpart from the scalar value
+        inline constexpr as_integer_t<T> maxexponent() noexcept
+        {
+            return as_integer_t<T>(maxexponent<typename T::value_type>());
+        }
+
+        template <>
+        inline constexpr int32_t maxexponent<float>() noexcept
+        {
+            return 127; // largest binary exponent of a finite IEEE-754 single
+        }
+
+        template <>
+        inline constexpr int64_t maxexponent<double>() noexcept
+        {
+            return 1023; // largest binary exponent of a finite IEEE-754 double
+        }
+
+        /********************************
+         * maxexponentm1 implementation *
+         ********************************/
+
+        template <class T> // generic T exposing value_type: build the integer counterpart from the scalar value
+        inline constexpr as_integer_t<T> maxexponentm1() noexcept
+        {
+            return as_integer_t<T>(maxexponentm1<typename T::value_type>());
+        }
+
+        template <>
+        inline constexpr int32_t maxexponentm1<float>() noexcept
+        {
+            return 126; // maxexponent<float>() - 1
+        }
+
+        template <>
+        inline constexpr int64_t maxexponentm1<double>() noexcept
+        {
+            return 1022; // maxexponent<double>() - 1
+        }
+
+        /**********************
+         * nmb implementation *
+         **********************/
+
+        template <class T> // generic T exposing value_type: forward to the scalar specialization
+        inline constexpr int32_t nmb() noexcept
+        {
+            return nmb<typename T::value_type>();
+        }
+
+        template <>
+        inline constexpr int32_t nmb<float>() noexcept
+        {
+            return 23; // explicitly stored mantissa bits of an IEEE-754 single
+        }
+
+        template <>
+        inline constexpr int32_t nmb<double>() noexcept
+        {
+            return 52; // explicitly stored mantissa bits of an IEEE-754 double
+        }
+
+        /***********************
+         * zero implementation *
+         ***********************/
+
+        template <class T> // T must expose value_type
+        inline constexpr T zero() noexcept
+        {
+            return T(typename T::value_type(0)); // build T from a value_type zero
+        }
+
+        /***************************
+         * minvalue implementation *
+         ***************************/
+
+        namespace detail
+        {
+            template <class T> // fallback for every T without a specialization below
+            struct minvalue_impl
+            {
+                static constexpr T get_value() noexcept
+                {
+                    return std::numeric_limits<typename T::value_type>::min(); // NOTE(review): minvalue() below passes a scalar as T, for which T::value_type would not exist - confirm intent
+                }
+            };
+
+            template <class T>
+            struct minvalue_common // shared body of the fixed-width integer specializations
+            {
+                static constexpr T get_value() noexcept
+                {
+                    return std::numeric_limits<T>::min();
+                }
+            };
+
+            template <>
+            struct minvalue_impl<int8_t> : minvalue_common<int8_t>
+            {
+            };
+            template <>
+            struct minvalue_impl<uint8_t> : minvalue_common<uint8_t>
+            {
+            };
+            template <>
+            struct minvalue_impl<int16_t> : minvalue_common<int16_t>
+            {
+            };
+            template <>
+            struct minvalue_impl<uint16_t> : minvalue_common<uint16_t>
+            {
+            };
+            template <>
+            struct minvalue_impl<int32_t> : minvalue_common<int32_t>
+            {
+            };
+            template <>
+            struct minvalue_impl<uint32_t> : minvalue_common<uint32_t>
+            {
+            };
+            template <>
+            struct minvalue_impl<int64_t> : minvalue_common<int64_t>
+            {
+            };
+            template <>
+            struct minvalue_impl<uint64_t> : minvalue_common<uint64_t>
+            {
+            };
+
+            template <>
+            struct minvalue_impl<float>
+            {
+                static float get_value() noexcept // not constexpr, unlike the integer variants
+                {
+                    return bit_cast<float>((uint32_t)0xff7fffff); // most negative finite single, not numeric_limits<float>::min()
+                }
+            };
+
+            template <>
+            struct minvalue_impl<double>
+            {
+                static double get_value() noexcept // not constexpr, unlike the integer variants
+                {
+                    return bit_cast<double>((uint64_t)0xffefffffffffffff); // most negative finite double, not numeric_limits<double>::min()
+                }
+            };
+        }
+
+        template <class T>
+        inline constexpr T minvalue() noexcept
+        {
+            return T(detail::minvalue_impl<typename T::value_type>::get_value()); // build T from the scalar minimum of its value_type
+        }
+
+        /***************************
+         * maxvalue implementation *
+         ***************************/
+
+        template <class T> // T must expose value_type
+        inline constexpr T maxvalue() noexcept
+        {
+            return T(std::numeric_limits<typename T::value_type>::max()); // build T from the scalar maximum of its value_type
+        }
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp
new file mode 100644
index 0000000000..64e9ed65d1
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp
@@ -0,0 +1,80 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_FMA3_AVX_HPP
+#define XSIMD_FMA3_AVX_HPP
+
+#include "../types/xsimd_fma3_avx_register.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        // fnma: -(x * y) + z, one fused 256-bit instruction
+        template <class A>
+        inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+        {
+            return _mm256_fnmadd_ps(x, y, z);
+        }
+
+        template <class A>
+        inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+        {
+            return _mm256_fnmadd_pd(x, y, z);
+        }
+
+        // fnms: -(x * y) - z
+        template <class A>
+        inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+        {
+            return _mm256_fnmsub_ps(x, y, z);
+        }
+
+        template <class A>
+        inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+        {
+            return _mm256_fnmsub_pd(x, y, z);
+        }
+
+        // fma: (x * y) + z
+        template <class A>
+        inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+        {
+            return _mm256_fmadd_ps(x, y, z);
+        }
+
+        template <class A>
+        inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+        {
+            return _mm256_fmadd_pd(x, y, z);
+        }
+
+        // fms: (x * y) - z
+        template <class A>
+        inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+        {
+            return _mm256_fmsub_ps(x, y, z);
+        }
+
+        template <class A>
+        inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+        {
+            return _mm256_fmsub_pd(x, y, z);
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp
new file mode 100644
index 0000000000..134053951a
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp
@@ -0,0 +1,46 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_FMA3_AVX2_HPP
+#define XSIMD_FMA3_AVX2_HPP
+
+#include "../types/xsimd_fma3_avx2_register.hpp"
+
+// If xsimd_fma3_avx.hpp was already included, drop its guard so it can be included again below, and remember to restore it
+#ifdef XSIMD_FMA3_AVX_HPP
+#undef XSIMD_FMA3_AVX_HPP
+#define XSIMD_FORCE_FMA3_AVX_HPP
+#endif
+
+// Disallow inclusion of ./xsimd_fma3_avx_register.hpp: pre-define its guard (if not already set) and remember to undo that afterwards
+#ifndef XSIMD_FMA3_AVX_REGISTER_HPP
+#define XSIMD_FMA3_AVX_REGISTER_HPP
+#define XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
+#endif
+
+// Include ./xsimd_fma3_avx.hpp with every `avx` token replaced by `avx2`, so its kernels take requires_arch<fma3<avx2>>
+#define avx avx2
+#include "./xsimd_fma3_avx.hpp"
+#undef avx
+#undef XSIMD_FMA3_AVX_HPP
+
+// Carefully restore both guards to the state they had before the block above ran
+#ifdef XSIMD_FORCE_FMA3_AVX_HPP
+#define XSIMD_FMA3_AVX_HPP
+#undef XSIMD_FORCE_FMA3_AVX_HPP
+#endif
+
+#ifdef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
+#undef XSIMD_FMA3_AVX_REGISTER_HPP
+#undef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
+#endif
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp
new file mode 100644
index 0000000000..55c38f13a4
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp
@@ -0,0 +1,79 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_FMA3_SSE_HPP
+#define XSIMD_FMA3_SSE_HPP
+
+#include "../types/xsimd_fma3_sse_register.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+        // fnma: -(x * y) + z, one fused 128-bit instruction
+        template <class A>
+        inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+        {
+            return _mm_fnmadd_ps(x, y, z);
+        }
+
+        template <class A>
+        inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+        {
+            return _mm_fnmadd_pd(x, y, z);
+        }
+
+        // fnms: -(x * y) - z
+        template <class A>
+        inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+        {
+            return _mm_fnmsub_ps(x, y, z);
+        }
+
+        template <class A>
+        inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+        {
+            return _mm_fnmsub_pd(x, y, z);
+        }
+
+        // fma: (x * y) + z
+        template <class A>
+        inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+        {
+            return _mm_fmadd_ps(x, y, z);
+        }
+
+        template <class A>
+        inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+        {
+            return _mm_fmadd_pd(x, y, z);
+        }
+
+        // fms: (x * y) - z
+        template <class A>
+        inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+        {
+            return _mm_fmsub_ps(x, y, z);
+        }
+
+        template <class A>
+        inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+        {
+            return _mm_fmsub_pd(x, y, z);
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_fma4.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_fma4.hpp
new file mode 100644
index 0000000000..6a97d711e9
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_fma4.hpp
@@ -0,0 +1,79 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_FMA4_HPP
+#define XSIMD_FMA4_HPP
+
+#include "../types/xsimd_fma4_register.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        // fnma: -(x * y) + z (FMA4 "nmacc"); arguments are taken as simd_register rather than batch
+        template <class A>
+        inline batch<float, A> fnma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
+        {
+            return _mm_nmacc_ps(x, y, z);
+        }
+
+        template <class A>
+        inline batch<double, A> fnma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
+        {
+            return _mm_nmacc_pd(x, y, z);
+        }
+
+        // fnms: -(x * y) - z (FMA4 "nmsub")
+        template <class A>
+        inline batch<float, A> fnms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
+        {
+            return _mm_nmsub_ps(x, y, z);
+        }
+
+        template <class A>
+        inline batch<double, A> fnms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
+        {
+            return _mm_nmsub_pd(x, y, z);
+        }
+
+        // fma: (x * y) + z (FMA4 "macc")
+        template <class A>
+        inline batch<float, A> fma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
+        {
+            return _mm_macc_ps(x, y, z);
+        }
+
+        template <class A>
+        inline batch<double, A> fma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
+        {
+            return _mm_macc_pd(x, y, z);
+        }
+
+        // fms: (x * y) - z (FMA4 "msub")
+        template <class A>
+        inline batch<float, A> fms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
+        {
+            return _mm_msub_ps(x, y, z);
+        }
+
+        template <class A>
+        inline batch<double, A> fms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
+        {
+            return _mm_msub_pd(x, y, z);
+        }
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_generic.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_generic.hpp
new file mode 100644
index 0000000000..6403cfb0fc
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_generic.hpp
@@ -0,0 +1,23 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_HPP
+#define XSIMD_GENERIC_HPP
+
+#include "./generic/xsimd_generic_arithmetic.hpp"
+#include "./generic/xsimd_generic_complex.hpp"
+#include "./generic/xsimd_generic_logical.hpp"
+#include "./generic/xsimd_generic_math.hpp"
+#include "./generic/xsimd_generic_memory.hpp"
+#include "./generic/xsimd_generic_rounding.hpp"
+#include "./generic/xsimd_generic_trigo.hpp"
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp
new file mode 100644
index 0000000000..86e398a5ea
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp
@@ -0,0 +1,38 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_FWD_HPP
+#define XSIMD_GENERIC_FWD_HPP
+
+#include "../types/xsimd_batch_constant.hpp"
+
+#include <type_traits>
+
+namespace xsimd
+{
+    namespace kernel
+    {
+        // forward declarations of generic kernels (declarations only, no bodies here); every one except gt is restricted to integral T
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept;
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+        template <class A, class T>
+        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+
+    }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp
new file mode 100644
index 0000000000..cf0f796a1e
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp
@@ -0,0 +1,86 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_ISA_HPP
+#define XSIMD_ISA_HPP
+
+#include "../config/xsimd_arch.hpp"
+
+#include "./xsimd_generic_fwd.hpp"
+
+#if XSIMD_WITH_SSE2
+#include "./xsimd_sse2.hpp"
+#endif
+
+#if XSIMD_WITH_SSE3
+#include "./xsimd_sse3.hpp"
+#endif
+
+#if XSIMD_WITH_SSSE3
+#include "./xsimd_ssse3.hpp"
+#endif
+
+#if XSIMD_WITH_SSE4_1
+#include "./xsimd_sse4_1.hpp"
+#endif
+
+#if XSIMD_WITH_SSE4_2
+#include "./xsimd_sse4_2.hpp"
+#endif
+
+#if XSIMD_WITH_FMA3_SSE
+#include "./xsimd_fma3_sse.hpp"
+#endif
+
+#if XSIMD_WITH_FMA4
+#include "./xsimd_fma4.hpp"
+#endif
+
+#if XSIMD_WITH_AVX
+#include "./xsimd_avx.hpp"
+#endif
+
+#if XSIMD_WITH_FMA3_AVX
+#include "./xsimd_fma3_avx.hpp"
+#endif
+
+#if XSIMD_WITH_AVX2
+#include "./xsimd_avx2.hpp"
+#endif
+
+#if XSIMD_WITH_FMA3_AVX2
+#include "./xsimd_fma3_avx2.hpp"
+#endif
+
+#if XSIMD_WITH_AVX512F
+#include "./xsimd_avx512f.hpp"
+#endif
+
+#if XSIMD_WITH_AVX512BW
+#include "./xsimd_avx512bw.hpp"
+#endif
+
+#if XSIMD_WITH_NEON
+#include "./xsimd_neon.hpp"
+#endif
+
+#if XSIMD_WITH_NEON64
+#include "./xsimd_neon64.hpp"
+#endif
+
+#if XSIMD_WITH_SVE
+#include "./xsimd_sve.hpp"
+#endif
+
+// Must come last to have access to all conversion specializations.
+#include "./xsimd_generic.hpp"
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp
new file mode 100644
index 0000000000..57c662cd63
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp
@@ -0,0 +1,2670 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_NEON_HPP
+#define XSIMD_NEON_HPP
+
+#include <algorithm>
+#include <complex>
+#include <tuple>
+#include <type_traits>
+
+#include "../types/xsimd_neon_register.hpp"
+#include "../types/xsimd_utils.hpp"
+
+// Wrap intrinsics in inline functions (namespace wrap) so we can pass them as function pointers
+// - OP: intrinsics name prefix, e.g., vorrq; the _u8 ... _s32 suffixes are appended by the macro
+// - RT: template applied to the argument register type to obtain the wrapper's return type
+#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT)                                \
+    namespace wrap                                                          \
+    {                                                                       \
+        inline RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept  \
+        {                                                                   \
+            return ::OP##_u8(a, b);                                         \
+        }                                                                   \
+        inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept     \
+        {                                                                   \
+            return ::OP##_s8(a, b);                                         \
+        }                                                                   \
+        inline RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \
+        {                                                                   \
+            return ::OP##_u16(a, b);                                        \
+        }                                                                   \
+        inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept    \
+        {                                                                   \
+            return ::OP##_s16(a, b);                                        \
+        }                                                                   \
+        inline RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \
+        {                                                                   \
+            return ::OP##_u32(a, b);                                        \
+        }                                                                   \
+        inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept    \
+        {                                                                   \
+            return ::OP##_s32(a, b);                                        \
+        }                                                                   \
+    }
+// Same as WRAP_BINARY_INT_EXCLUDING_64, plus wrappers for the 64-bit lane types (_u64, _s64)
+#define WRAP_BINARY_INT(OP, RT)                                             \
+    WRAP_BINARY_INT_EXCLUDING_64(OP, RT)                                    \
+    namespace wrap                                                          \
+    {                                                                       \
+        inline RT<uint64x2_t> OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \
+        {                                                                   \
+            return ::OP##_u64(a, b);                                        \
+        }                                                                   \
+        inline RT<int64x2_t> OP##_s64(int64x2_t a, int64x2_t b) noexcept    \
+        {                                                                   \
+            return ::OP##_s64(a, b);                                        \
+        }                                                                   \
+    }
+// Binary wrapper for the single-precision variant only: wrap::OP##_f32 forwards to ::OP##_f32
+#define WRAP_BINARY_FLOAT(OP, RT)                                              \
+    namespace wrap                                                             \
+    {                                                                          \
+        inline RT<float32x4_t> OP##_f32(float32x4_t a, float32x4_t b) noexcept \
+        {                                                                      \
+            return ::OP##_f32(a, b);                                           \
+        }                                                                      \
+    }
+// Unary wrappers (8/16/32-bit integer lanes): one register in, the same register type out
+#define WRAP_UNARY_INT_EXCLUDING_64(OP)                   \
+    namespace wrap                                        \
+    {                                                     \
+        inline uint8x16_t OP##_u8(uint8x16_t a) noexcept  \
+        {                                                 \
+            return ::OP##_u8(a);                          \
+        }                                                 \
+        inline int8x16_t OP##_s8(int8x16_t a) noexcept    \
+        {                                                 \
+            return ::OP##_s8(a);                          \
+        }                                                 \
+        inline uint16x8_t OP##_u16(uint16x8_t a) noexcept \
+        {                                                 \
+            return ::OP##_u16(a);                         \
+        }                                                 \
+        inline int16x8_t OP##_s16(int16x8_t a) noexcept   \
+        {                                                 \
+            return ::OP##_s16(a);                         \
+        }                                                 \
+        inline uint32x4_t OP##_u32(uint32x4_t a) noexcept \
+        {                                                 \
+            return ::OP##_u32(a);                         \
+        }                                                 \
+        inline int32x4_t OP##_s32(int32x4_t a) noexcept   \
+        {                                                 \
+            return ::OP##_s32(a);                         \
+        }                                                 \
+    }
+// Same as WRAP_UNARY_INT_EXCLUDING_64, plus wrappers for the 64-bit lane types (_u64, _s64)
+#define WRAP_UNARY_INT(OP)                                \
+    WRAP_UNARY_INT_EXCLUDING_64(OP)                       \
+    namespace wrap                                        \
+    {                                                     \
+        inline uint64x2_t OP##_u64(uint64x2_t a) noexcept \
+        {                                                 \
+            return ::OP##_u64(a);                         \
+        }                                                 \
+        inline int64x2_t OP##_s64(int64x2_t a) noexcept   \
+        {                                                 \
+            return ::OP##_s64(a);                         \
+        }                                                 \
+    }
+// Unary wrapper for the single-precision variant only: wrap::OP##_f32 forwards to ::OP##_f32
+#define WRAP_UNARY_FLOAT(OP)                                \
+    namespace wrap                                          \
+    {                                                       \
+        inline float32x4_t OP##_f32(float32x4_t a) noexcept \
+        {                                                   \
+            return ::OP##_f32(a);                           \
+        }                                                   \
+    }
+
+// No-op same-type reinterpret casts, provided to ease coding: each returns its argument unchanged
+inline uint8x16_t vreinterpretq_u8_u8(uint8x16_t reg) noexcept { return reg; }
+inline int8x16_t vreinterpretq_s8_s8(int8x16_t reg) noexcept { return reg; }
+inline uint16x8_t vreinterpretq_u16_u16(uint16x8_t reg) noexcept { return reg; }
+inline int16x8_t vreinterpretq_s16_s16(int16x8_t reg) noexcept { return reg; }
+inline uint32x4_t vreinterpretq_u32_u32(uint32x4_t reg) noexcept { return reg; }
+inline int32x4_t vreinterpretq_s32_s32(int32x4_t reg) noexcept { return reg; }
+inline uint64x2_t vreinterpretq_u64_u64(uint64x2_t reg) noexcept { return reg; }
+inline int64x2_t vreinterpretq_s64_s64(int64x2_t reg) noexcept { return reg; }
+inline float32x4_t vreinterpretq_f32_f32(float32x4_t reg) noexcept { return reg; }
+
+namespace xsimd
+{
+    template <class batch_type, bool... Values>
+    struct batch_bool_constant; // declaration only; no definition appears in this part of the file
+
+    namespace kernel
+    {
+        using namespace types;
+
+        namespace detail
+        {
+            template <template <class> class return_type, class... T>
+            struct neon_dispatcher_base // function pointer tables, one entry per register type T
+            {
+                struct unary // entries have the signature return_type<T> (*)(T)
+                {
+                    using container_type = std::tuple<return_type<T> (*)(T)...>;
+                    const container_type m_func;
+                    // Fetch the entry of type return_type<U> (*)(U) through xsimd::detail::get and call it.
+                    template <class U>
+                    return_type<U> apply(U rhs) const noexcept
+                    {
+                        using fn_ptr = return_type<U> (*)(U);
+                        const fn_ptr impl = xsimd::detail::get<fn_ptr>(m_func);
+                        return impl(rhs);
+                    }
+                };
+                // Two-operand counterpart of unary.
+                struct binary // entries have the signature return_type<T> (*)(T, T)
+                {
+                    using container_type = std::tuple<return_type<T> (*)(T, T)...>;
+                    const container_type m_func;
+                    // Fetch the entry of type return_type<U> (*)(U, U) through xsimd::detail::get and call it.
+                    template <class U>
+                    return_type<U> apply(U lhs, U rhs) const noexcept
+                    {
+                        using fn_ptr = return_type<U> (*)(U, U);
+                        const fn_ptr impl = xsimd::detail::get<fn_ptr>(m_func);
+                        return impl(lhs, rhs);
+                    }
+                };
+            };
+
+            /*
+             * arithmetic dispatchers: the dispatched function returns its own argument type
+             */
+
+            template <class Reg>
+            using identity_return_type = Reg;
+
+            template <class... Regs>
+            struct neon_dispatcher_impl : neon_dispatcher_base<identity_return_type, Regs...>
+            {
+            };
+
+            // All nine register types; this order is also the order of the function pointer tuple.
+            using neon_dispatcher = neon_dispatcher_impl<
+                uint8x16_t, int8x16_t, uint16x8_t, int16x8_t,
+                uint32x4_t, int32x4_t, uint64x2_t, int64x2_t,
+                float32x4_t>;
+
+            // Same list without the two 64-bit integer register types.
+            using excluding_int64_dispatcher = neon_dispatcher_impl<
+                uint8x16_t, int8x16_t, uint16x8_t, int16x8_t,
+                uint32x4_t, int32x4_t, float32x4_t>;
+
+            /*
+             * comparison dispatchers: the result register type is given by comp_return_type<T>
+             */
+
+            template <class T>
+            struct comp_return_type_impl; // primary template left undefined; only the specializations below exist
+
+            template <>
+            struct comp_return_type_impl<uint8x16_t>
+            {
+                using type = uint8x16_t;
+            };
+
+            template <>
+            struct comp_return_type_impl<int8x16_t>
+            {
+                using type = uint8x16_t; // signed operands map to the unsigned register of the same lane layout
+            };
+
+            template <>
+            struct comp_return_type_impl<uint16x8_t>
+            {
+                using type = uint16x8_t;
+            };
+
+            template <>
+            struct comp_return_type_impl<int16x8_t>
+            {
+                using type = uint16x8_t; // signed -> unsigned, same lane layout
+            };
+
+            template <>
+            struct comp_return_type_impl<uint32x4_t>
+            {
+                using type = uint32x4_t;
+            };
+
+            template <>
+            struct comp_return_type_impl<int32x4_t>
+            {
+                using type = uint32x4_t; // signed -> unsigned, same lane layout
+            };
+
+            template <>
+            struct comp_return_type_impl<uint64x2_t>
+            {
+                using type = uint64x2_t;
+            };
+
+            template <>
+            struct comp_return_type_impl<int64x2_t>
+            {
+                using type = uint64x2_t; // signed -> unsigned, same lane layout
+            };
+
+            template <>
+            struct comp_return_type_impl<float32x4_t>
+            {
+                using type = uint32x4_t; // float operands map to the 4 x 32-bit unsigned register
+            };
+
+            template <class T>
+            using comp_return_type = typename comp_return_type_impl<T>::type; // shorthand used as the dispatcher's return_type
+
+            template <class... T>
+            struct neon_comp_dispatcher_impl : neon_dispatcher_base<comp_return_type, T...>
+            {
+            };
+
+            using excluding_int64_comp_dispatcher = neon_comp_dispatcher_impl<uint8x16_t, int8x16_t,
+                                                                              uint16x8_t, int16x8_t,
+                                                                              uint32x4_t, int32x4_t,
+                                                                              float32x4_t>; // no 64-bit integer entries
+
+            /*
+             * enabling / disabling metafunctions: resolve to int when the scalar type is accepted
+             */
+
+            template <class Scalar>
+            using enable_neon_type_t
+                = typename std::enable_if<std::is_same<Scalar, float>::value || std::is_integral<Scalar>::value, int>::type;
+
+            template <class Scalar>
+            using exclude_int64_neon_t = typename std::enable_if<std::is_same<Scalar, float>::value
+                                                                     || (sizeof(Scalar) != 8 && std::is_integral<Scalar>::value), int>::type;
+        }
+
+        /*
+         * broadcast: duplicate one scalar into every lane (vdupq_n_*), one overload per lane type
+         */
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+        {
+            return vdupq_n_u8(static_cast<uint8_t>(val));
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+        {
+            return vdupq_n_s8(static_cast<int8_t>(val));
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+        {
+            return vdupq_n_u16(static_cast<uint16_t>(val));
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+        {
+            return vdupq_n_s16(static_cast<int16_t>(val));
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+        {
+            return vdupq_n_u32(static_cast<uint32_t>(val));
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+        {
+            return vdupq_n_s32(static_cast<int32_t>(val));
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+        {
+            return vdupq_n_u64(static_cast<uint64_t>(val));
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+        {
+            return vdupq_n_s64(static_cast<int64_t>(val));
+        }
+
+        template <class A>
+        inline batch<float, A> broadcast(float val, requires_arch<neon>) noexcept
+        {
+            return vdupq_n_f32(val);
+        }
+
+        /*
+         * set: build a batch (or a batch_bool mask) from one explicit value per lane
+         */
+
+        template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<neon>, Args... values) noexcept
+        {
+            return xsimd::types::detail::neon_vector_type<T> { values... };
+        }
+
+        template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
+        inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... flags) noexcept
+        {
+            using lane_type = as_unsigned_integer_t<T>;
+            using mask_register = typename batch_bool<T, A>::register_type;
+            return mask_register { static_cast<lane_type>(flags ? -1LL : 0LL)... }; // true -> all bits set
+        }
+
+        template <class A>
+        inline batch<float, A> set(batch<float, A> const&, requires_arch<neon>, float f0, float f1, float f2, float f3) noexcept
+        {
+            return float32x4_t { f0, f1, f2, f3 };
+        }
+        // Complex set: real and imaginary parts go to separate registers; the result uses arch A, not the default arch.
+        template <class A>
+        inline batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>,
+                                                 std::complex<float> c0, std::complex<float> c1,
+                                                 std::complex<float> c2, std::complex<float> c3) noexcept
+        {
+            return batch<std::complex<float>, A>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() },
+                                                 float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() });
+        }
+        // Mask for float lanes: a true argument yields -1 converted to the unsigned lane type, false yields 0.
+        template <class A, class... Args>
+        inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... flags) noexcept
+        {
+            using lane_type = as_unsigned_integer_t<float>;
+            using mask_register = typename batch_bool<float, A>::register_type;
+            return mask_register { static_cast<lane_type>(flags ? -1LL : 0LL)... };
+        }
+
+        /*
+         * from_bool: AND the mask with a broadcast 1 (bit pattern of 1.f for float) to get a 0/1 batch
+         */
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vandq_u8(arg, vdupq_n_u8(1));
+        }
+        // Signed lanes: the mask is bit-cast with the ACLE vreinterpretq_* intrinsics (not reinterpret_cast) before the AND.
+        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vandq_s8(vreinterpretq_s8_u8(arg.data), vdupq_n_s8(1));
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vandq_u16(arg, vdupq_n_u16(1));
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vandq_s16(vreinterpretq_s16_u16(arg.data), vdupq_n_s16(1));
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vandq_u32(arg, vdupq_n_u32(1));
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vandq_s32(vreinterpretq_s32_u32(arg.data), vdupq_n_s32(1));
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vandq_u64(arg, vdupq_n_u64(1));
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vandq_s64(vreinterpretq_s64_u64(arg.data), vdupq_n_s64(1));
+        }
+
+        template <class A>
+        inline batch<float, A> from_bool(batch_bool<float, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vreinterpretq_f32_u32(vandq_u32(arg, vreinterpretq_u32_f32(vdupq_n_f32(1.f))));
+        }
+
+        /*
+         * load (aligned): vld1q_* on a pointer passed through the xsimd_aligned_load helper macro
+         */
+
+        // A::alignment() cannot be called here, so the alignment is an immediate: 16 for __builtin_assume_aligned
+        // (GCC/Clang), 128 for the MSVC *_ex intrinsics; any other compiler gets a plain load without a hint.
+#if defined(__clang__) || defined(__GNUC__)
+#define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
+#elif defined(_MSC_VER)
+#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
+#else
+#define xsimd_aligned_load(inst, type, expr) inst((type)expr)
+#endif
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return xsimd_aligned_load(vld1q_u8, uint8_t*, src);
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return xsimd_aligned_load(vld1q_s8, int8_t*, src);
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return xsimd_aligned_load(vld1q_u16, uint16_t*, src);
+        }
+        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return xsimd_aligned_load(vld1q_s16, int16_t*, src);
+        }
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return xsimd_aligned_load(vld1q_u32, uint32_t*, src);
+        }
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return xsimd_aligned_load(vld1q_s32, int32_t*, src);
+        }
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return xsimd_aligned_load(vld1q_u64, uint64_t*, src);
+        }
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return xsimd_aligned_load(vld1q_s64, int64_t*, src);
+        }
+
+        template <class A>
+        inline batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept
+        {
+            return xsimd_aligned_load(vld1q_f32, float*, src);
+        }
+
+#undef xsimd_aligned_load // helper macro is local to the load_aligned overloads above
+        // load_unaligned: plain vld1q_* loads; src is reinterpreted as a pointer to the intrinsic's element type.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return vld1q_s8(reinterpret_cast<const int8_t*>(src));
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return vld1q_u16(reinterpret_cast<const uint16_t*>(src));
+        }
+        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return vld1q_s16(reinterpret_cast<const int16_t*>(src));
+        }
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return vld1q_u32(reinterpret_cast<const uint32_t*>(src));
+        }
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return vld1q_s32(reinterpret_cast<const int32_t*>(src));
+        }
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
+        }
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+        {
+            return vld1q_s64(reinterpret_cast<const int64_t*>(src));
+        }
+
+        template <class A>
+        inline batch<float, A> load_unaligned(float const* src, convert<float>, requires_arch<neon>) noexcept
+        {
+            return vld1q_f32(src);
+        }
+
+        /*
+         * store: write a whole register to memory with vst1q_*, one overload per lane type
+         */
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+        {
+            vst1q_u8(reinterpret_cast<uint8_t*>(dst), src);
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+        {
+            vst1q_s8(reinterpret_cast<int8_t*>(dst), src);
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+        {
+            vst1q_u16(reinterpret_cast<uint16_t*>(dst), src);
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+        {
+            vst1q_s16(reinterpret_cast<int16_t*>(dst), src);
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+        {
+            vst1q_u32(reinterpret_cast<uint32_t*>(dst), src);
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+        {
+            vst1q_s32(reinterpret_cast<int32_t*>(dst), src);
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+        {
+            vst1q_u64(reinterpret_cast<uint64_t*>(dst), src);
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+        {
+            vst1q_s64(reinterpret_cast<int64_t*>(dst), src);
+        }
+
+        template <class A>
+        inline void store_aligned(float* dst, batch<float, A> const& src, requires_arch<neon>) noexcept
+        {
+            vst1q_f32(dst, src);
+        }
+        // No separate unaligned path: store_unaligned forwards to the store_aligned overload for T.
+        template <class A, class T>
+        inline void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+        {
+            store_aligned<A>(dst, src, A {});
+        }
+
+        /*
+         * load_complex: vld2q_f32 fills two registers; val[0] becomes the real batch, val[1] the imaginary one
+         */
+
+        template <class A>
+        inline batch<std::complex<float>, A> load_complex_aligned(std::complex<float> const* mem, convert<std::complex<float>>, requires_arch<neon>) noexcept
+        {
+            // The complex array is read as a flat array of floats.
+            float32x4x2_t planes = vld2q_f32(reinterpret_cast<const float*>(mem));
+            batch<float, A> re = planes.val[0];
+            batch<float, A> im = planes.val[1];
+            using complex_batch = batch<std::complex<float>, A>;
+            return complex_batch { re, im };
+        }
+
+        template <class A>
+        inline batch<std::complex<float>, A> load_complex_unaligned(std::complex<float> const* mem, convert<std::complex<float>> cvt, requires_arch<neon>) noexcept
+        {
+            return load_complex_aligned<A>(mem, cvt, A {}); // same code path as the aligned load
+        }
+
+        /*
+         * store_complex: pack real() and imag() into a register pair and write it with vst2q_f32
+         */
+
+        template <class A>
+        inline void store_complex_aligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
+        {
+            float32x4x2_t planes;
+            planes.val[0] = src.real();
+            planes.val[1] = src.imag();
+            // The destination is written as a flat array of floats.
+            vst2q_f32(reinterpret_cast<float*>(dst), planes);
+        }
+
+        template <class A>
+        inline void store_complex_unaligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
+        {
+            store_complex_aligned<A>(dst, src, A {}); // same code path as the aligned store
+        }
+
+        /*
+         * neg (8/16/32-bit lanes): vnegq_* for signed lanes, wrapping 0 - x for unsigned lanes
+         */
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vsubq_u8(vdupq_n_u8(0), arg); // same bits as negating the lanes as signed values
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vnegq_s8(arg);
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vsubq_u16(vdupq_n_u16(0), arg); // same bits as negating the lanes as signed values
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vnegq_s16(arg);
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vsubq_u32(vdupq_n_u32(0), arg); // same bits as negating the lanes as signed values
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vnegq_s32(arg);
+        }
+        // 64-bit lanes: vnegq_s64 is A64-only, so negate as a wrapping vector 0 - x (no per-lane scalar negation).
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            return vsubq_u64(vdupq_n_u64(0), rhs);
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            return vsubq_s64(vdupq_n_s64(0), rhs); // wraps on the minimum value instead of scalar signed overflow
+        }
+        // Float lanes: negation maps directly to vnegq_f32.
+        template <class A>
+        inline batch<float, A> neg(batch<float, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return vnegq_f32(arg);
+        }
+
+        /*
+         * add: lane-wise addition, dispatched on the register type to the matching vaddq_* wrapper
+         */
+
+        WRAP_BINARY_INT(vaddq, detail::identity_return_type)
+        WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type)
+
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            // The tuple entries follow the register type list of detail::neon_dispatcher.
+            const detail::neon_dispatcher::binary table = {
+                std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16, wrap::vaddq_u32,
+                                wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64, wrap::vaddq_f32)
+            };
+            return table.apply(reg_t(lhs), reg_t(rhs));
+        }
+
+        /********
+         * sadd *
+         ********/
+
+        WRAP_BINARY_INT(vqaddq, detail::identity_return_type)
+
+        // Saturating lane-wise addition. Integer slots use the vqaddq wrappers; the float
+        // slot reuses the plain vaddq_f32 wrapper.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const detail::neon_dispatcher::binary sadd_table = {
+                std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8,
+                                wrap::vqaddq_u16, wrap::vqaddq_s16,
+                                wrap::vqaddq_u32, wrap::vqaddq_s32,
+                                wrap::vqaddq_u64, wrap::vqaddq_s64,
+                                wrap::vaddq_f32)
+            };
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return sadd_table.apply(lhs_reg, rhs_reg);
+        }
+
+        /*******
+         * sub *
+         *******/
+
+        WRAP_BINARY_INT(vsubq, detail::identity_return_type)
+        WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type)
+
+        // Lane-wise subtraction (lhs - rhs). Both operands are converted to their raw
+        // register type and handed to a dispatcher holding one vsubq wrapper per
+        // supported register type.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const detail::neon_dispatcher::binary sub_table = {
+                std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8,
+                                wrap::vsubq_u16, wrap::vsubq_s16,
+                                wrap::vsubq_u32, wrap::vsubq_s32,
+                                wrap::vsubq_u64, wrap::vsubq_s64,
+                                wrap::vsubq_f32)
+            };
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return sub_table.apply(lhs_reg, rhs_reg);
+        }
+
+        /********
+         * ssub *
+         ********/
+
+        WRAP_BINARY_INT(vqsubq, detail::identity_return_type)
+
+        // Saturating lane-wise subtraction. Integer slots use the vqsubq wrappers; the
+        // float slot reuses the plain vsubq_f32 wrapper.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const detail::neon_dispatcher::binary ssub_table = {
+                std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8,
+                                wrap::vqsubq_u16, wrap::vqsubq_s16,
+                                wrap::vqsubq_u32, wrap::vqsubq_s32,
+                                wrap::vqsubq_u64, wrap::vqsubq_s64,
+                                wrap::vsubq_f32)
+            };
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return ssub_table.apply(lhs_reg, rhs_reg);
+        }
+
+        /*******
+         * mul *
+         *******/
+
+        WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type)
+        WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type)
+
+        // Lane-wise multiplication for 8/16/32-bit integers and float. 64-bit integer
+        // lanes are excluded from this overload: no vmulq wrapper is listed for them.
+        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+        inline batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const detail::excluding_int64_dispatcher::binary mul_table = {
+                std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8,
+                                wrap::vmulq_u16, wrap::vmulq_s16,
+                                wrap::vmulq_u32, wrap::vmulq_s32,
+                                wrap::vmulq_f32)
+            };
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return mul_table.apply(lhs_reg, rhs_reg);
+        }
+
+        /*******
+         * div *
+         *******/
+
+#if defined(XSIMD_FAST_INTEGER_DIVISION)
+        // Fast signed 32-bit division (only built when XSIMD_FAST_INTEGER_DIVISION is
+        // defined): both operands are converted to float, divided, and the quotient is
+        // converted back to int32.
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const float32x4_t numerator = vcvtq_f32_s32(lhs);
+            const float32x4_t denominator = vcvtq_f32_s32(rhs);
+            return vcvtq_s32_f32(numerator / denominator);
+        }
+
+        // Fast unsigned 32-bit division (only built when XSIMD_FAST_INTEGER_DIVISION is
+        // defined): both operands are converted to float, divided, and the quotient is
+        // converted back to uint32.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const float32x4_t numerator = vcvtq_f32_u32(lhs);
+            const float32x4_t denominator = vcvtq_f32_u32(rhs);
+            return vcvtq_u32_f32(numerator / denominator);
+        }
+#endif
+
+        // Single-precision division computed as lhs * (1 / rhs), where the reciprocal is
+        // an estimate refined by two Newton-Raphson steps.
+        template <class A>
+        inline batch<float, A> div(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            // from stackoverflow & https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html
+            // Starting point: the initial estimate of 1 / rhs.
+            const float32x4_t estimate = reciprocal(rhs);
+
+            // Each step multiplies the current estimate by the vrecpsq_f32 correction
+            // factor. Two steps are used here; a single one may be enough depending on the
+            // accuracy an application needs, but that has to be tested.
+            const float32x4_t refined_once = vmulq_f32(vrecpsq_f32(rhs, estimate), estimate);
+            const float32x4_t refined_twice = vmulq_f32(vrecpsq_f32(rhs, refined_once), refined_once);
+
+            // a / b = a * (1 / b)
+            return vmulq_f32(lhs, refined_twice);
+        }
+
+        /******
+         * eq *
+         ******/
+
+        WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type)
+        WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type)
+
+        // Lane-wise equality for 8/16/32-bit integers and float, producing a batch_bool.
+        // 64-bit integer lanes are handled by a separate overload.
+        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+        inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const detail::excluding_int64_comp_dispatcher::binary eq_table = {
+                std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8,
+                                wrap::vceqq_u16, wrap::vceqq_s16,
+                                wrap::vceqq_u32, wrap::vceqq_s32,
+                                wrap::vceqq_f32)
+            };
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return eq_table.apply(lhs_reg, rhs_reg);
+        }
+
+        // Lane-wise equality of two masks (non-64-bit element types). The dispatcher only
+        // lists the unsigned 8/16/32-bit register types, compared with vceqq.
+        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+        inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using mask_reg_t = typename batch_bool<T, A>::register_type;
+            using mask_dispatcher = detail::neon_comp_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary;
+            const mask_dispatcher eq_table = {
+                std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32)
+            };
+            const mask_reg_t lhs_reg = mask_reg_t(lhs);
+            const mask_reg_t rhs_reg = mask_reg_t(rhs);
+            return eq_table.apply(lhs_reg, rhs_reg);
+        }
+
+        // Equality for 64-bit integer lanes, evaluated per lane in scalar code.
+        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const bool first = lhs.get(0) == rhs.get(0);
+            const bool second = lhs.get(1) == rhs.get(1);
+            return batch_bool<T, A>({ first, second });
+        }
+
+        // Equality of two masks over 64-bit integer lanes, evaluated per lane in scalar code.
+        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const bool first = lhs.get(0) == rhs.get(0);
+            const bool second = lhs.get(1) == rhs.get(1);
+            return batch_bool<T, A>({ first, second });
+        }
+
+        /*************
+         * fast_cast *
+         *************/
+
+        namespace detail
+        {
+            // int32 -> float conversion through vcvtq_f32_s32. The unnamed second
+            // parameter only selects the destination type.
+            template <class A>
+            inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
+            {
+                const float32x4_t converted = vcvtq_f32_s32(self);
+                return converted;
+            }
+
+            // uint32 -> float conversion through vcvtq_f32_u32.
+            template <class A>
+            inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
+            {
+                const float32x4_t converted = vcvtq_f32_u32(self);
+                return converted;
+            }
+
+            // float -> int32 conversion through vcvtq_s32_f32.
+            template <class A>
+            inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<neon>) noexcept
+            {
+                const int32x4_t converted = vcvtq_s32_f32(self);
+                return converted;
+            }
+
+            // float -> uint32 conversion through vcvtq_u32_f32.
+            template <class A>
+            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<neon>) noexcept
+            {
+                const uint32x4_t converted = vcvtq_u32_f32(self);
+                return converted;
+            }
+        }
+
+        /******
+         * lt *
+         ******/
+
+        WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type)
+        WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type)
+
+        // Lane-wise "less than" for 8/16/32-bit integers and float, producing a
+        // batch_bool. 64-bit integer lanes are handled by a separate overload.
+        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+        inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const detail::excluding_int64_comp_dispatcher::binary lt_table = {
+                std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8,
+                                wrap::vcltq_u16, wrap::vcltq_s16,
+                                wrap::vcltq_u32, wrap::vcltq_s32,
+                                wrap::vcltq_f32)
+            };
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return lt_table.apply(lhs_reg, rhs_reg);
+        }
+
+        // "Less than" for 64-bit integer lanes, evaluated per lane in scalar code.
+        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const bool first = lhs.get(0) < rhs.get(0);
+            const bool second = lhs.get(1) < rhs.get(1);
+            return batch_bool<T, A>({ first, second });
+        }
+
+        /******
+         * le *
+         ******/
+
+        WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type)
+        WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type)
+
+        // Lane-wise "less than or equal" for 8/16/32-bit integers and float, producing a
+        // batch_bool. 64-bit integer lanes are handled by a separate overload.
+        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+        inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const detail::excluding_int64_comp_dispatcher::binary le_table = {
+                std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8,
+                                wrap::vcleq_u16, wrap::vcleq_s16,
+                                wrap::vcleq_u32, wrap::vcleq_s32,
+                                wrap::vcleq_f32)
+            };
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return le_table.apply(lhs_reg, rhs_reg);
+        }
+
+        // "Less than or equal" for 64-bit integer lanes, evaluated per lane in scalar code.
+        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const bool first = lhs.get(0) <= rhs.get(0);
+            const bool second = lhs.get(1) <= rhs.get(1);
+            return batch_bool<T, A>({ first, second });
+        }
+
+        /******
+         * gt *
+         ******/
+
+        WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type)
+        WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type)
+
+        // Lane-wise "greater than" for 8/16/32-bit integers and float, producing a
+        // batch_bool. 64-bit integer lanes are handled by a separate overload.
+        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+        inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const detail::excluding_int64_comp_dispatcher::binary gt_table = {
+                std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8,
+                                wrap::vcgtq_u16, wrap::vcgtq_s16,
+                                wrap::vcgtq_u32, wrap::vcgtq_s32,
+                                wrap::vcgtq_f32)
+            };
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return gt_table.apply(lhs_reg, rhs_reg);
+        }
+
+        // "Greater than" for 64-bit integer lanes, evaluated per lane in scalar code.
+        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const bool first = lhs.get(0) > rhs.get(0);
+            const bool second = lhs.get(1) > rhs.get(1);
+            return batch_bool<T, A>({ first, second });
+        }
+
+        /******
+         * ge *
+         ******/
+
+        WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type)
+        WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type)
+
+        // Lane-wise "greater than or equal" for 8/16/32-bit integers and float, producing
+        // a batch_bool. 64-bit integer lanes are handled by a separate overload.
+        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+        inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const detail::excluding_int64_comp_dispatcher::binary ge_table = {
+                std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8,
+                                wrap::vcgeq_u16, wrap::vcgeq_s16,
+                                wrap::vcgeq_u32, wrap::vcgeq_s32,
+                                wrap::vcgeq_f32)
+            };
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return ge_table.apply(lhs_reg, rhs_reg);
+        }
+
+        // "Greater than or equal" for 64-bit integer lanes, evaluated per lane in scalar code.
+        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const bool first = lhs.get(0) >= rhs.get(0);
+            const bool second = lhs.get(1) >= rhs.get(1);
+            return batch_bool<T, A>({ first, second });
+        }
+
+        /*******************
+         * batch_bool_cast *
+         *******************/
+
+        // Converts a mask over T_in lanes into a mask over T_out lanes by casting the
+        // underlying register to the destination register type. The unnamed second
+        // parameter only selects the destination type.
+        template <class A, class T_out, class T_in>
+        inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon>) noexcept
+        {
+            using out_reg_t = typename batch_bool<T_out, A>::register_type;
+            return out_reg_t(self);
+        }
+
+        /***************
+         * bitwise_and *
+         ***************/
+
+        WRAP_BINARY_INT(vandq, detail::identity_return_type)
+
+        namespace detail
+        {
+            // Bitwise AND of two float registers: both are reinterpreted as 32-bit
+            // unsigned lanes, combined with vandq_u32, and reinterpreted back to float.
+            inline float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) noexcept
+            {
+                return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs),
+                                                       vreinterpretq_u32_f32(rhs)));
+            }
+
+            // Bitwise AND on raw NEON registers of type V. The dispatcher holds one
+            // vandq wrapper per integer register type plus bitwise_and_f32 for float.
+            // Declared inline/noexcept like the sibling bitwise_*_neon helpers, since
+            // it is only called from noexcept kernels.
+            template <class V>
+            inline V bitwise_and_neon(V const& lhs, V const& rhs) noexcept
+            {
+                const neon_dispatcher::binary dispatcher = {
+                    std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16,
+                                    wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64,
+                                    bitwise_and_f32)
+                };
+                return dispatcher.apply(lhs, rhs);
+            }
+        }
+
+        // Bitwise AND of two batches, performed on their raw registers by
+        // detail::bitwise_and_neon.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return detail::bitwise_and_neon(lhs_reg, rhs_reg);
+        }
+
+        // Bitwise AND of two masks, performed on their raw registers by
+        // detail::bitwise_and_neon.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using mask_reg_t = typename batch_bool<T, A>::register_type;
+            const mask_reg_t lhs_reg = mask_reg_t(lhs);
+            const mask_reg_t rhs_reg = mask_reg_t(rhs);
+            return detail::bitwise_and_neon(lhs_reg, rhs_reg);
+        }
+
+        /**************
+         * bitwise_or *
+         **************/
+
+        WRAP_BINARY_INT(vorrq, detail::identity_return_type)
+
+        namespace detail
+        {
+            // Bitwise OR of two float registers, carried out on their 32-bit unsigned
+            // reinterpretation.
+            inline float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept
+            {
+                const uint32x4_t lhs_bits = vreinterpretq_u32_f32(lhs);
+                const uint32x4_t rhs_bits = vreinterpretq_u32_f32(rhs);
+                return vreinterpretq_f32_u32(vorrq_u32(lhs_bits, rhs_bits));
+            }
+
+            // Bitwise OR on raw NEON registers of type V: one vorrq wrapper per integer
+            // register type, bitwise_or_f32 for float.
+            template <class V>
+            inline V bitwise_or_neon(V const& lhs, V const& rhs) noexcept
+            {
+                const neon_dispatcher::binary or_table = {
+                    std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8,
+                                    wrap::vorrq_u16, wrap::vorrq_s16,
+                                    wrap::vorrq_u32, wrap::vorrq_s32,
+                                    wrap::vorrq_u64, wrap::vorrq_s64,
+                                    bitwise_or_f32)
+                };
+                return or_table.apply(lhs, rhs);
+            }
+        }
+
+        // Bitwise OR of two batches, performed on their raw registers by
+        // detail::bitwise_or_neon.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return detail::bitwise_or_neon(lhs_reg, rhs_reg);
+        }
+
+        // Bitwise OR of two masks, performed on their raw registers by
+        // detail::bitwise_or_neon.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using mask_reg_t = typename batch_bool<T, A>::register_type;
+            const mask_reg_t lhs_reg = mask_reg_t(lhs);
+            const mask_reg_t rhs_reg = mask_reg_t(rhs);
+            return detail::bitwise_or_neon(lhs_reg, rhs_reg);
+        }
+
+        /***************
+         * bitwise_xor *
+         ***************/
+
+        WRAP_BINARY_INT(veorq, detail::identity_return_type)
+
+        namespace detail
+        {
+            // Bitwise XOR of two float registers, carried out on their 32-bit unsigned
+            // reinterpretation.
+            inline float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept
+            {
+                const uint32x4_t lhs_bits = vreinterpretq_u32_f32(lhs);
+                const uint32x4_t rhs_bits = vreinterpretq_u32_f32(rhs);
+                return vreinterpretq_f32_u32(veorq_u32(lhs_bits, rhs_bits));
+            }
+
+            // Bitwise XOR on raw NEON registers of type V: one veorq wrapper per integer
+            // register type, bitwise_xor_f32 for float.
+            template <class V>
+            inline V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept
+            {
+                const neon_dispatcher::binary xor_table = {
+                    std::make_tuple(wrap::veorq_u8, wrap::veorq_s8,
+                                    wrap::veorq_u16, wrap::veorq_s16,
+                                    wrap::veorq_u32, wrap::veorq_s32,
+                                    wrap::veorq_u64, wrap::veorq_s64,
+                                    bitwise_xor_f32)
+                };
+                return xor_table.apply(lhs, rhs);
+            }
+        }
+
+        // Bitwise XOR of two batches, performed on their raw registers by
+        // detail::bitwise_xor_neon.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return detail::bitwise_xor_neon(lhs_reg, rhs_reg);
+        }
+
+        // Bitwise XOR of two masks, performed on their raw registers by
+        // detail::bitwise_xor_neon.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using mask_reg_t = typename batch_bool<T, A>::register_type;
+            const mask_reg_t lhs_reg = mask_reg_t(lhs);
+            const mask_reg_t rhs_reg = mask_reg_t(rhs);
+            return detail::bitwise_xor_neon(lhs_reg, rhs_reg);
+        }
+
+        /*******
+         * neq *
+         *******/
+
+        // Inequality of two masks, expressed as their bitwise XOR.
+        template <class A, class T>
+        inline batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const A arch_tag {};
+            return bitwise_xor(lhs, rhs, arch_tag);
+        }
+
+        /***************
+         * bitwise_not *
+         ***************/
+
+        WRAP_UNARY_INT_EXCLUDING_64(vmvnq)
+
+        namespace detail
+        {
+            // Bitwise NOT of signed 64-bit lanes, carried out with vmvnq_s32 on the
+            // 32-bit reinterpretation of the register.
+            inline int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
+            {
+                const int32x4_t as_words = vreinterpretq_s32_s64(arg);
+                return vreinterpretq_s64_s32(vmvnq_s32(as_words));
+            }
+
+            // Bitwise NOT of unsigned 64-bit lanes, carried out with vmvnq_u32 on the
+            // 32-bit reinterpretation of the register.
+            inline uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
+            {
+                const uint32x4_t as_words = vreinterpretq_u32_u64(arg);
+                return vreinterpretq_u64_u32(vmvnq_u32(as_words));
+            }
+
+            // Bitwise NOT of float lanes, carried out with vmvnq_u32 on the 32-bit
+            // unsigned reinterpretation of the register.
+            inline float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
+            {
+                const uint32x4_t as_words = vreinterpretq_u32_f32(arg);
+                return vreinterpretq_f32_u32(vmvnq_u32(as_words));
+            }
+
+            // Bitwise NOT on a raw NEON register of type V: vmvnq wrappers for the
+            // 8/16/32-bit integer types, the helpers above for 64-bit integers and float.
+            template <class V>
+            inline V bitwise_not_neon(V const& arg) noexcept
+            {
+                const neon_dispatcher::unary not_table = {
+                    std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8,
+                                    wrap::vmvnq_u16, wrap::vmvnq_s16,
+                                    wrap::vmvnq_u32, wrap::vmvnq_s32,
+                                    bitwise_not_u64, bitwise_not_s64,
+                                    bitwise_not_f32)
+                };
+                return not_table.apply(arg);
+            }
+        }
+
+        // Bitwise NOT of a batch, performed on its raw register by
+        // detail::bitwise_not_neon.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const reg_t arg_reg = reg_t(arg);
+            return detail::bitwise_not_neon(arg_reg);
+        }
+
+        // Bitwise NOT of a mask, performed on its raw register by
+        // detail::bitwise_not_neon.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            using mask_reg_t = typename batch_bool<T, A>::register_type;
+            const mask_reg_t arg_reg = mask_reg_t(arg);
+            return detail::bitwise_not_neon(arg_reg);
+        }
+
+        /******************
+         * bitwise_andnot *
+         ******************/
+
+        WRAP_BINARY_INT(vbicq, detail::identity_return_type)
+
+        namespace detail
+        {
+            // Bit-clear (vbicq_u32) of two float registers, carried out on their 32-bit
+            // unsigned reinterpretation.
+            inline float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept
+            {
+                const uint32x4_t lhs_bits = vreinterpretq_u32_f32(lhs);
+                const uint32x4_t rhs_bits = vreinterpretq_u32_f32(rhs);
+                return vreinterpretq_f32_u32(vbicq_u32(lhs_bits, rhs_bits));
+            }
+
+            // Bit-clear on raw NEON registers of type V: one vbicq wrapper per integer
+            // register type, bitwise_andnot_f32 for float.
+            template <class V>
+            inline V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept
+            {
+                const detail::neon_dispatcher::binary andnot_table = {
+                    std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8,
+                                    wrap::vbicq_u16, wrap::vbicq_s16,
+                                    wrap::vbicq_u32, wrap::vbicq_s32,
+                                    wrap::vbicq_u64, wrap::vbicq_s64,
+                                    bitwise_andnot_f32)
+                };
+                return andnot_table.apply(lhs, rhs);
+            }
+        }
+
+        // AND-NOT of two batches, performed on their raw registers by
+        // detail::bitwise_andnot_neon.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return detail::bitwise_andnot_neon(lhs_reg, rhs_reg);
+        }
+
+        // AND-NOT of two masks, performed on their raw registers by
+        // detail::bitwise_andnot_neon.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using mask_reg_t = typename batch_bool<T, A>::register_type;
+            const mask_reg_t lhs_reg = mask_reg_t(lhs);
+            const mask_reg_t rhs_reg = mask_reg_t(rhs);
+            return detail::bitwise_andnot_neon(lhs_reg, rhs_reg);
+        }
+
+        /*******
+         * min *
+         *******/
+
+        WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type)
+        WRAP_BINARY_FLOAT(vminq, detail::identity_return_type)
+
+        // Lane-wise minimum for 8/16/32-bit integers and float through the vminq
+        // wrappers. 64-bit integer lanes are handled by a separate overload.
+        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+        inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const detail::excluding_int64_dispatcher::binary min_table = {
+                std::make_tuple(wrap::vminq_u8, wrap::vminq_s8,
+                                wrap::vminq_u16, wrap::vminq_s16,
+                                wrap::vminq_u32, wrap::vminq_s32,
+                                wrap::vminq_f32)
+            };
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return min_table.apply(lhs_reg, rhs_reg);
+        }
+
+        // Minimum for 64-bit integer lanes, computed per lane with std::min.
+        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const T first = std::min(lhs.get(0), rhs.get(0));
+            const T second = std::min(lhs.get(1), rhs.get(1));
+            return { first, second };
+        }
+
+        /*******
+         * max *
+         *******/
+
+        WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type)
+        WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type)
+
+        // Lane-wise maximum for 8/16/32-bit integers and float through the vmaxq
+        // wrappers. 64-bit integer lanes are handled by a separate overload.
+        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+        inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const detail::excluding_int64_dispatcher::binary max_table = {
+                std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8,
+                                wrap::vmaxq_u16, wrap::vmaxq_s16,
+                                wrap::vmaxq_u32, wrap::vmaxq_s32,
+                                wrap::vmaxq_f32)
+            };
+            const reg_t lhs_reg = reg_t(lhs);
+            const reg_t rhs_reg = reg_t(rhs);
+            return max_table.apply(lhs_reg, rhs_reg);
+        }
+
+        // Maximum for 64-bit integer lanes, computed per lane with std::max.
+        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const T first = std::max(lhs.get(0), rhs.get(0));
+            const T second = std::max(lhs.get(1), rhs.get(1));
+            return { first, second };
+        }
+
+        /*******
+         * abs *
+         *******/
+
+        namespace wrap
+        {
+            // Forwarders to the global vabsq intrinsics for the signed integer register
+            // types, so that they can be stored in the abs dispatcher tuple.
+            inline int8x16_t vabsq_s8(int8x16_t a) noexcept
+            {
+                return ::vabsq_s8(a);
+            }
+
+            inline int16x8_t vabsq_s16(int16x8_t a) noexcept
+            {
+                return ::vabsq_s16(a);
+            }
+
+            inline int32x4_t vabsq_s32(int32x4_t a) noexcept
+            {
+                return ::vabsq_s32(a);
+            }
+        }
+        WRAP_UNARY_FLOAT(vabsq)
+
+        namespace detail
+        {
+            // Absolute value of unsigned lanes is the identity: each helper returns its
+            // argument unchanged and exists to fill the unsigned slots of the abs dispatcher.
+            inline uint8x16_t abs_u8(uint8x16_t arg) noexcept
+            {
+                return arg;
+            }
+
+            inline uint16x8_t abs_u16(uint16x8_t arg) noexcept
+            {
+                return arg;
+            }
+
+            inline uint32x4_t abs_u32(uint32x4_t arg) noexcept
+            {
+                return arg;
+            }
+        }
+
+        // Lane-wise absolute value for 8/16/32-bit integers and float. Unsigned slots use
+        // the identity helpers from detail, signed and float slots the vabsq wrappers.
+        template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+        inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            using reg_t = typename batch<T, A>::register_type;
+            const detail::excluding_int64_dispatcher::unary abs_table = {
+                std::make_tuple(detail::abs_u8, wrap::vabsq_s8,
+                                detail::abs_u16, wrap::vabsq_s16,
+                                detail::abs_u32, wrap::vabsq_s32,
+                                wrap::vabsq_f32)
+            };
+            const reg_t arg_reg = reg_t(arg);
+            return abs_table.apply(arg_reg);
+        }
+
+        /*********
+         * rsqrt *
+         *********/
+
+        // Reciprocal square root, returned as the raw vrsqrteq_f32 estimate without any
+        // refinement step.
+        template <class A>
+        inline batch<float, A> rsqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
+        {
+            const float32x4_t estimate = vrsqrteq_f32(arg);
+            return estimate;
+        }
+
+        /********
+         * sqrt *
+         ********/
+
+        // Single-precision square root built from the reciprocal-square-root estimate:
+        // the estimate is refined with vrsqrtsq_f32 correction factors and multiplied by
+        // arg; lanes where arg compares equal to zero are forced to zero.
+        // NOTE(review): a +inf lane presumably yields NaN (inf times a zero estimate),
+        // and a -0.0 lane returns +0.0 through the select - confirm whether callers care.
+        template <class A>
+        inline batch<float, A> sqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
+        {
+            // initial estimate of 1 / sqrt(arg)
+            batch<float, A> sqrt_reciprocal = vrsqrteq_f32(arg);
+            // one iter
+            sqrt_reciprocal = sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
+            // second correction factor folded into the final product arg * (1 / sqrt(arg))
+            batch<float, A> sqrt_approx = arg * sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
+            batch<float, A> zero(0.f);
+            // replace the approximation by exactly zero wherever the input is zero
+            return select(arg == zero, zero, sqrt_approx);
+        }
+
+        /********************
+         * Fused operations *
+         ********************/
+
+#ifdef __ARM_FEATURE_FMA
+        // Fused multiply-add x * y + z through vfmaq_f32, which takes the addend first.
+        // Only built when __ARM_FEATURE_FMA is defined.
+        template <class A>
+        inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
+        {
+            const float32x4_t fused = vfmaq_f32(z, x, y);
+            return fused;
+        }
+
+        // Fused multiply-subtract x * y - z, expressed as vfmaq_f32 with the negated
+        // addend. Only built when __ARM_FEATURE_FMA is defined.
+        template <class A>
+        inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
+        {
+            const batch<float, A> negated_z = -z;
+            return vfmaq_f32(negated_z, x, y);
+        }
+#endif
+
+        /*********
+         * haddp *
+         *********/
+
+        // Horizontal add of four rows: reads row[0] through row[3] (called a, b, c, d
+        // below) and returns a batch whose lane i holds the sum of the four lanes of row[i].
+        template <class A>
+        inline batch<float, A> haddp(const batch<float, A>* row, requires_arch<neon>) noexcept
+        {
+            // vpadd_f32(p, q) produces (p0 + p1, q0 + q1).
+            // a_pairs = (a0 + a1, a2 + a3)
+            const float32x2_t a_pairs = vpadd_f32(vget_low_f32(row[0]), vget_high_f32(row[0]));
+            // b_pairs = (b0 + b1, b2 + b3)
+            const float32x2_t b_pairs = vpadd_f32(vget_low_f32(row[1]), vget_high_f32(row[1]));
+            // c_pairs = (c0 + c1, c2 + c3)
+            const float32x2_t c_pairs = vpadd_f32(vget_low_f32(row[2]), vget_high_f32(row[2]));
+            // d_pairs = (d0 + d1, d2 + d3)
+            const float32x2_t d_pairs = vpadd_f32(vget_low_f32(row[3]), vget_high_f32(row[3]));
+            // sums_ab = (a0..3, b0..3)
+            const float32x2_t sums_ab = vpadd_f32(a_pairs, b_pairs);
+            // sums_cd = (c0..3, d0..3)
+            const float32x2_t sums_cd = vpadd_f32(c_pairs, d_pairs);
+            // result = (a0..3, b0..3, c0..3, d0..3)
+            return vcombine_f32(sums_ab, sums_cd);
+        }
+
+        /**************
+         * reciprocal *
+         **************/
+
+        // Reciprocal estimate of each lane, returned as the raw vrecpeq_f32 result
+        // without any refinement step.
+        template <class A>
+        inline batch<float, A>
+        reciprocal(const batch<float, A>& x,
+                   kernel::requires_arch<neon>) noexcept
+        {
+            const float32x4_t estimate = vrecpeq_f32(x);
+            return estimate;
+        }
+
+        /**********
+         * insert *
+         **********/
+
+        // Returns a copy of self with lane I replaced by val (unsigned 8-bit lanes).
+        template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 1> = 0>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+        {
+            const uint8x16_t updated = vsetq_lane_u8(val, self, I);
+            return updated;
+        }
+
+        // Returns a copy of self with lane I replaced by val (signed 8-bit lanes).
+        template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 1> = 0>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+        {
+            const int8x16_t updated = vsetq_lane_s8(val, self, I);
+            return updated;
+        }
+
+        // Returns a copy of self with lane I replaced by val (unsigned 16-bit lanes).
+        template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 2> = 0>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+        {
+            const uint16x8_t updated = vsetq_lane_u16(val, self, I);
+            return updated;
+        }
+
+        // Returns a copy of self with lane I replaced by val (signed 16-bit lanes).
+        // The signature is written in terms of T, like the sibling overloads, so that T
+        // can be deduced from the arguments; with a hard-coded int16_t signature T only
+        // appears in the enable_if default and the overload is never selected.
+        template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 2> = 0>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+        {
+            return vsetq_lane_s16(val, self, I);
+        }
+
+        // Returns a copy of self with lane I replaced by val (unsigned 32-bit lanes).
+        template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 4> = 0>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+        {
+            const uint32x4_t updated = vsetq_lane_u32(val, self, I);
+            return updated;
+        }
+
+        // Returns a copy of self with lane I replaced by val (signed 32-bit lanes).
+        template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 4> = 0>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+        {
+            const int32x4_t updated = vsetq_lane_s32(val, self, I);
+            return updated;
+        }
+
+        // Returns a copy of self with lane I replaced by val (unsigned 64-bit lanes).
+        template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+        {
+            const uint64x2_t updated = vsetq_lane_u64(val, self, I);
+            return updated;
+        }
+
+        // Returns a copy of self with lane I replaced by val (signed 64-bit lanes).
+        template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+        {
+            const int64x2_t updated = vsetq_lane_s64(val, self, I);
+            return updated;
+        }
+
+        // Returns a copy of self with lane I replaced by val (single-precision lanes).
+        template <class A, size_t I>
+        inline batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<neon>) noexcept
+        {
+            const float32x4_t updated = vsetq_lane_f32(val, self, I);
+            return updated;
+        }
+
+        /********************
+         * nearbyint_as_int *
+         *******************/
+
+        // nearbyint_as_int, float -> int32: converts every lane of `self` to a
+        // 32-bit integer. Two candidates are computed per lane, (a +/- 0.5)
+        // converted to integer and an even candidate derived from the truncated
+        // value; the even candidate is used only for lanes whose fractional part
+        // compares equal to +/- 0.5.
+        template <class A>
+        inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+                                                  requires_arch<neon>) noexcept
+        {
+            /* origin: https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047 */
+            //  Contributors to this work are:
+            //   John W. Ratcliff <jratcliffscarab@gmail.com>
+            //   Brandon Rowlett <browlett@nvidia.com>
+            //   Ken Fast <kfast@gdeb.com>
+            //   Eric van Beurden <evanbeurden@nvidia.com>
+            //   Alexander Potylitsin <apotylitsin@nvidia.com>
+            //   Hasindu Gamaarachchi <hasindu2008@gmail.com>
+            //   Jim Huang <jserv@biilabs.io>
+            //   Mark Cheng <marktwtn@biilabs.io>
+            //   Malcolm James MacLeod <malcolm@gulden.com>
+            //   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
+            //   Sebastian Pop <spop@amazon.com>
+            //   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
+            //   Danila Kutenin <danilak@google.com>
+            //   François Turban (JishinMaster) <francois.turban@gmail.com>
+            //   Pei-Hsuan Hung <afcidk@gmail.com>
+            //   Yang-Hao Yuan <yanghau@biilabs.io>
+            //   Syoyo Fujita <syoyo@lighttransport.com>
+            //   Brecht Van Lommel <brecht@blender.org>
+
+            /*
+             * sse2neon is freely redistributable under the MIT License.
+             *
+             * Permission is hereby granted, free of charge, to any person obtaining a copy
+             * of this software and associated documentation files (the "Software"), to deal
+             * in the Software without restriction, including without limitation the rights
+             * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+             * copies of the Software, and to permit persons to whom the Software is
+             * furnished to do so, subject to the following conditions:
+             *
+             * The above copyright notice and this permission notice shall be included in
+             * all copies or substantial portions of the Software.
+             *
+             * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+             * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+             * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+             * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+             * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+             * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+             * SOFTWARE.
+             */
+
+            // Mask covering only the sign bit of each 32-bit lane.
+            const auto signmask = vdupq_n_u32(0x80000000);
+            // Bit-select: sign bit taken from self, every other bit from 0.5f.
+            const auto half = vbslq_f32(signmask, self,
+                                        vdupq_n_f32(0.5f)); /* +/- 0.5 */
+            const auto r_normal = vcvtq_s32_f32(vaddq_f32(
+                self, half)); /* round to integer: [a + 0.5]*/
+            const auto r_trunc = vcvtq_s32_f32(self); /* truncate to integer: [a] */
+            // Top bit of -[a] moved down to bit 0 by the logical shift.
+            const auto plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+                vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+            const auto r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                          vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+            const auto delta = vsubq_f32(
+                self,
+                vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+            const auto is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
+            // Lanes where the comparison held take r_even, all others r_normal.
+            return vbslq_s32(is_delta_half, r_even, r_normal);
+        }
+
+        /**************
+         * reduce_add *
+         **************/
+
+        namespace detail
+        {
+            // Scalar accumulation helper: starting from T(0), adds arg[0] through
+            // arg[batch<T, A>::size - 1] in index order and returns the total.
+            template <class T, class A, class V>
+            inline T sum_batch(V const& arg) noexcept
+            {
+                constexpr std::size_t lane_count = batch<T, A>::size;
+                T total = T(0);
+                std::size_t lane = 0;
+                while (lane < lane_count)
+                {
+                    total += arg[lane];
+                    ++lane;
+                }
+                return total;
+            }
+        }
+
+        // reduce_add, unsigned 8-bit lanes: one vpadd_u8 over the two register
+        // halves followed by three self-pairings, then lane 0 is returned.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            const uint8x8_t lower = vget_low_u8(arg);
+            const uint8x8_t upper = vget_high_u8(arg);
+            uint8x8_t partial = vpadd_u8(lower, upper);
+            partial = vpadd_u8(partial, partial);
+            partial = vpadd_u8(partial, partial);
+            partial = vpadd_u8(partial, partial);
+            return vget_lane_u8(partial, 0);
+        }
+
+        // reduce_add, signed 8-bit lanes: one vpadd_s8 over the two register
+        // halves followed by three self-pairings, then lane 0 is returned.
+        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            const int8x8_t lower = vget_low_s8(arg);
+            const int8x8_t upper = vget_high_s8(arg);
+            int8x8_t partial = vpadd_s8(lower, upper);
+            partial = vpadd_s8(partial, partial);
+            partial = vpadd_s8(partial, partial);
+            partial = vpadd_s8(partial, partial);
+            return vget_lane_s8(partial, 0);
+        }
+
+        // reduce_add, unsigned 16-bit lanes: one vpadd_u16 over the two register
+        // halves followed by two self-pairings, then lane 0 is returned.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            const uint16x4_t lower = vget_low_u16(arg);
+            const uint16x4_t upper = vget_high_u16(arg);
+            uint16x4_t partial = vpadd_u16(lower, upper);
+            partial = vpadd_u16(partial, partial);
+            partial = vpadd_u16(partial, partial);
+            return vget_lane_u16(partial, 0);
+        }
+
+        // reduce_add, signed 16-bit lanes: one vpadd_s16 over the two register
+        // halves followed by two self-pairings, then lane 0 is returned.
+        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            const int16x4_t lower = vget_low_s16(arg);
+            const int16x4_t upper = vget_high_s16(arg);
+            int16x4_t partial = vpadd_s16(lower, upper);
+            partial = vpadd_s16(partial, partial);
+            partial = vpadd_s16(partial, partial);
+            return vget_lane_s16(partial, 0);
+        }
+
+        // reduce_add, unsigned 32-bit lanes: one vpadd_u32 over the two register
+        // halves followed by one self-pairing, then lane 0 is returned.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            const uint32x2_t lower = vget_low_u32(arg);
+            const uint32x2_t upper = vget_high_u32(arg);
+            uint32x2_t partial = vpadd_u32(lower, upper);
+            partial = vpadd_u32(partial, partial);
+            return vget_lane_u32(partial, 0);
+        }
+
+        // reduce_add, signed 32-bit lanes: one vpadd_s32 over the two register
+        // halves followed by one self-pairing, then lane 0 is returned.
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            const int32x2_t lower = vget_low_s32(arg);
+            const int32x2_t upper = vget_high_s32(arg);
+            int32x2_t partial = vpadd_s32(lower, upper);
+            partial = vpadd_s32(partial, partial);
+            return vget_lane_s32(partial, 0);
+        }
+
+        // reduce_add, 64-bit integral lanes: reads elements 0 and 1 through
+        // batch::get and returns their sum.
+        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            const auto first = arg.get(0);
+            const auto second = arg.get(1);
+            return first + second;
+        }
+
+        // reduce_add, float lanes: one vpadd_f32 over the two register halves
+        // followed by one self-pairing, then lane 0 is returned.
+        template <class A>
+        inline float reduce_add(batch<float, A> const& arg, requires_arch<neon>) noexcept
+        {
+            const float32x2_t lower = vget_low_f32(arg);
+            const float32x2_t upper = vget_high_f32(arg);
+            float32x2_t partial = vpadd_f32(lower, upper);
+            partial = vpadd_f32(partial, partial);
+            return vget_lane_f32(partial, 0);
+        }
+
+        /**************
+         * reduce_max *
+         **************/
+
+        // Using generic implementation because ARM does not provide intrinsics
+        // for this operation
+
+        /**************
+         * reduce_min *
+         **************/
+
+        // Using generic implementation because ARM does not provide intrinsics
+        // for this operation
+
+        /**********
+         * select *
+         **********/
+
+        namespace wrap
+        {
+            // Ordinary inline functions that forward to the global vbslq_*
+            // intrinsics of the same name. select() places them in a tuple of
+            // function pointers, which needs real functions to point at.
+            inline uint8x16_t vbslq_u8(uint8x16_t mask, uint8x16_t first, uint8x16_t second) noexcept
+            {
+                return ::vbslq_u8(mask, first, second);
+            }
+            inline int8x16_t vbslq_s8(uint8x16_t mask, int8x16_t first, int8x16_t second) noexcept
+            {
+                return ::vbslq_s8(mask, first, second);
+            }
+            inline uint16x8_t vbslq_u16(uint16x8_t mask, uint16x8_t first, uint16x8_t second) noexcept
+            {
+                return ::vbslq_u16(mask, first, second);
+            }
+            inline int16x8_t vbslq_s16(uint16x8_t mask, int16x8_t first, int16x8_t second) noexcept
+            {
+                return ::vbslq_s16(mask, first, second);
+            }
+            inline uint32x4_t vbslq_u32(uint32x4_t mask, uint32x4_t first, uint32x4_t second) noexcept
+            {
+                return ::vbslq_u32(mask, first, second);
+            }
+            inline int32x4_t vbslq_s32(uint32x4_t mask, int32x4_t first, int32x4_t second) noexcept
+            {
+                return ::vbslq_s32(mask, first, second);
+            }
+            inline uint64x2_t vbslq_u64(uint64x2_t mask, uint64x2_t first, uint64x2_t second) noexcept
+            {
+                return ::vbslq_u64(mask, first, second);
+            }
+            inline int64x2_t vbslq_s64(uint64x2_t mask, int64x2_t first, int64x2_t second) noexcept
+            {
+                return ::vbslq_s64(mask, first, second);
+            }
+            inline float32x4_t vbslq_f32(uint32x4_t mask, float32x4_t first, float32x4_t second) noexcept
+            {
+                return ::vbslq_f32(mask, first, second);
+            }
+        }
+
+        namespace detail
+        {
+            // Holds one select function pointer per register type in T... and
+            // calls the one whose signature matches the argument type of apply().
+            template <class... T>
+            struct neon_select_dispatcher_impl
+            {
+                using container_type = std::tuple<T (*)(comp_return_type<T>, T, T)...>;
+                const container_type m_func;
+
+                // Looks up the tuple entry of type U (*)(comp_return_type<U>, U, U)
+                // and invokes it with the given arguments.
+                template <class U>
+                U apply(comp_return_type<U> cond, U lhs, U rhs) const noexcept
+                {
+                    using selector_ptr = U (*)(comp_return_type<U>, U, U);
+                    return xsimd::detail::get<selector_ptr>(m_func)(cond, lhs, rhs);
+                }
+            };
+
+            // Dispatcher over the nine 128-bit register types handled by select().
+            using neon_select_dispatcher = neon_select_dispatcher_impl<
+                uint8x16_t, int8x16_t,
+                uint16x8_t, int16x8_t,
+                uint32x4_t, int32x4_t,
+                uint64x2_t, int64x2_t,
+                float32x4_t>;
+        }
+
+        // select: converts `cond`, `a` and `b` to their register types and
+        // forwards them to the wrap::vbslq_* function matching the register
+        // type of `a`, chosen through neon_select_dispatcher.
+        template <class A, class T, detail::enable_neon_type_t<T> = 0>
+        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<neon>) noexcept
+        {
+            using mask_register = typename batch_bool<T, A>::register_type;
+            using data_register = typename batch<T, A>::register_type;
+            const detail::neon_select_dispatcher table = {
+                std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8,
+                                wrap::vbslq_u16, wrap::vbslq_s16,
+                                wrap::vbslq_u32, wrap::vbslq_s32,
+                                wrap::vbslq_u64, wrap::vbslq_s64,
+                                wrap::vbslq_f32)
+            };
+            const mask_register mask = mask_register(cond);
+            const data_register first = data_register(a);
+            const data_register second = data_register(b);
+            return table.apply(mask, first, second);
+        }
+
+        // select with a compile-time mask: builds a batch_bool from the constant
+        // pack b... and defers to the runtime-mask select for neon.
+        template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0>
+        inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept
+        {
+            const batch_bool<T, A> runtime_mask { b... };
+            return select(runtime_mask, true_br, false_br, neon {});
+        }
+
+        /**********
+         * zip_lo *
+         **********/
+
+        // zip_lo, unsigned 8-bit lanes: vzip_u8 on the low halves of lhs and
+        // rhs, with both result vectors recombined into one 128-bit register.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const uint8x8_t left = vget_low_u8(lhs);
+            const uint8x8_t right = vget_low_u8(rhs);
+            const uint8x8x2_t zipped = vzip_u8(left, right);
+            return vcombine_u8(zipped.val[0], zipped.val[1]);
+        }
+
+        // zip_lo, signed 8-bit lanes: vzip_s8 on the low halves of lhs and
+        // rhs, with both result vectors recombined into one 128-bit register.
+        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const int8x8_t left = vget_low_s8(lhs);
+            const int8x8_t right = vget_low_s8(rhs);
+            const int8x8x2_t zipped = vzip_s8(left, right);
+            return vcombine_s8(zipped.val[0], zipped.val[1]);
+        }
+
+        // zip_lo, unsigned 16-bit lanes: vzip_u16 on the low halves of lhs and
+        // rhs, with both result vectors recombined into one 128-bit register.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const uint16x4_t left = vget_low_u16(lhs);
+            const uint16x4_t right = vget_low_u16(rhs);
+            const uint16x4x2_t zipped = vzip_u16(left, right);
+            return vcombine_u16(zipped.val[0], zipped.val[1]);
+        }
+
+        // zip_lo, signed 16-bit lanes: vzip_s16 on the low halves of lhs and
+        // rhs, with both result vectors recombined into one 128-bit register.
+        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const int16x4_t left = vget_low_s16(lhs);
+            const int16x4_t right = vget_low_s16(rhs);
+            const int16x4x2_t zipped = vzip_s16(left, right);
+            return vcombine_s16(zipped.val[0], zipped.val[1]);
+        }
+
+        // zip_lo, unsigned 32-bit lanes: vzip_u32 on the low halves of lhs and
+        // rhs, with both result vectors recombined into one 128-bit register.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const uint32x2_t left = vget_low_u32(lhs);
+            const uint32x2_t right = vget_low_u32(rhs);
+            const uint32x2x2_t zipped = vzip_u32(left, right);
+            return vcombine_u32(zipped.val[0], zipped.val[1]);
+        }
+
+        // zip_lo, signed 32-bit lanes: vzip_s32 on the low halves of lhs and
+        // rhs, with both result vectors recombined into one 128-bit register.
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const int32x2_t left = vget_low_s32(lhs);
+            const int32x2_t right = vget_low_s32(rhs);
+            const int32x2x2_t zipped = vzip_s32(left, right);
+            return vcombine_s32(zipped.val[0], zipped.val[1]);
+        }
+
+        // zip_lo, unsigned 64-bit lanes: with one lane per half, the result is
+        // the low half of lhs combined with the low half of rhs.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const uint64x1_t left = vget_low_u64(lhs);
+            const uint64x1_t right = vget_low_u64(rhs);
+            return vcombine_u64(left, right);
+        }
+
+        // zip_lo, signed 64-bit lanes: with one lane per half, the result is
+        // the low half of lhs combined with the low half of rhs.
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const int64x1_t left = vget_low_s64(lhs);
+            const int64x1_t right = vget_low_s64(rhs);
+            return vcombine_s64(left, right);
+        }
+
+        // zip_lo, float lanes: vzip_f32 on the low halves of lhs and rhs, with
+        // both result vectors recombined into one 128-bit register.
+        template <class A>
+        inline batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const float32x2_t left = vget_low_f32(lhs);
+            const float32x2_t right = vget_low_f32(rhs);
+            const float32x2x2_t zipped = vzip_f32(left, right);
+            return vcombine_f32(zipped.val[0], zipped.val[1]);
+        }
+
+        /**********
+         * zip_hi *
+         **********/
+
+        // zip_hi, unsigned 8-bit lanes: vzip_u8 on the high halves of lhs and
+        // rhs, with both result vectors recombined into one 128-bit register.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const uint8x8_t left = vget_high_u8(lhs);
+            const uint8x8_t right = vget_high_u8(rhs);
+            const uint8x8x2_t zipped = vzip_u8(left, right);
+            return vcombine_u8(zipped.val[0], zipped.val[1]);
+        }
+
+        // zip_hi, signed 8-bit lanes: vzip_s8 on the high halves of lhs and
+        // rhs, with both result vectors recombined into one 128-bit register.
+        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const int8x8_t left = vget_high_s8(lhs);
+            const int8x8_t right = vget_high_s8(rhs);
+            const int8x8x2_t zipped = vzip_s8(left, right);
+            return vcombine_s8(zipped.val[0], zipped.val[1]);
+        }
+
+        // zip_hi, unsigned 16-bit lanes: vzip_u16 on the high halves of lhs and
+        // rhs, with both result vectors recombined into one 128-bit register.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const uint16x4_t left = vget_high_u16(lhs);
+            const uint16x4_t right = vget_high_u16(rhs);
+            const uint16x4x2_t zipped = vzip_u16(left, right);
+            return vcombine_u16(zipped.val[0], zipped.val[1]);
+        }
+
+        // zip_hi, signed 16-bit lanes: vzip_s16 on the high halves of lhs and
+        // rhs, with both result vectors recombined into one 128-bit register.
+        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const int16x4_t left = vget_high_s16(lhs);
+            const int16x4_t right = vget_high_s16(rhs);
+            const int16x4x2_t zipped = vzip_s16(left, right);
+            return vcombine_s16(zipped.val[0], zipped.val[1]);
+        }
+
+        // zip_hi, unsigned 32-bit lanes: vzip_u32 on the high halves of lhs and
+        // rhs, with both result vectors recombined into one 128-bit register.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const uint32x2_t left = vget_high_u32(lhs);
+            const uint32x2_t right = vget_high_u32(rhs);
+            const uint32x2x2_t zipped = vzip_u32(left, right);
+            return vcombine_u32(zipped.val[0], zipped.val[1]);
+        }
+
+        // zip_hi, signed 32-bit lanes: vzip_s32 on the high halves of lhs and
+        // rhs, with both result vectors recombined into one 128-bit register.
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const int32x2_t left = vget_high_s32(lhs);
+            const int32x2_t right = vget_high_s32(rhs);
+            const int32x2x2_t zipped = vzip_s32(left, right);
+            return vcombine_s32(zipped.val[0], zipped.val[1]);
+        }
+
+        // zip_hi, unsigned 64-bit lanes: with one lane per half, the result is
+        // the high half of lhs combined with the high half of rhs.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const uint64x1_t left = vget_high_u64(lhs);
+            const uint64x1_t right = vget_high_u64(rhs);
+            return vcombine_u64(left, right);
+        }
+
+        // zip_hi, signed 64-bit lanes: with one lane per half, the result is
+        // the high half of lhs combined with the high half of rhs.
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const int64x1_t left = vget_high_s64(lhs);
+            const int64x1_t right = vget_high_s64(rhs);
+            return vcombine_s64(left, right);
+        }
+
+        // zip_hi, float lanes: vzip_f32 on the high halves of lhs and rhs, with
+        // both result vectors recombined into one 128-bit register.
+        template <class A>
+        inline batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const float32x2_t left = vget_high_f32(lhs);
+            const float32x2_t right = vget_high_f32(rhs);
+            const float32x2x2_t zipped = vzip_f32(left, right);
+            return vcombine_f32(zipped.val[0], zipped.val[1]);
+        }
+
+        /****************
+         * extract_pair *
+         ****************/
+
+        namespace detail
+        {
+            // The vextq_* intrinsics take their lane offset as a compile-time
+            // constant, so the runtime offset `n` is matched against each
+            // candidate index I of an index_sequence in turn. The overload whose
+            // I equals n issues vextq_*(rhs, lhs, I); otherwise the search
+            // continues with the remaining indices.
+
+            // End of the search: no candidate index matched n.
+            template <class A, class T>
+            inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
+            {
+                assert(false && "extract_pair out of bounds");
+                return batch<T, A> {};
+            }
+
+            // Unsigned 8-bit lanes.
+            template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
+            inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+                }
+                return vextq_u8(rhs, lhs, I);
+            }
+
+            // Signed 8-bit lanes.
+            template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 1> = 0>
+            inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+                }
+                return vextq_s8(rhs, lhs, I);
+            }
+
+            // Unsigned 16-bit lanes.
+            template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
+            inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+                }
+                return vextq_u16(rhs, lhs, I);
+            }
+
+            // Signed 16-bit lanes.
+            template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 2> = 0>
+            inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+                }
+                return vextq_s16(rhs, lhs, I);
+            }
+
+            // Unsigned 32-bit lanes.
+            template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
+            inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+                }
+                return vextq_u32(rhs, lhs, I);
+            }
+
+            // Signed 32-bit lanes.
+            template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 4> = 0>
+            inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+                }
+                return vextq_s32(rhs, lhs, I);
+            }
+
+            // Unsigned 64-bit lanes.
+            template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
+            inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+                }
+                return vextq_u64(rhs, lhs, I);
+            }
+
+            // Signed 64-bit lanes.
+            template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 8> = 0>
+            inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+                }
+                return vextq_s64(rhs, lhs, I);
+            }
+
+            // Float lanes.
+            template <class A, size_t I, size_t... Is>
+            inline batch<float, A> extract_pair(batch<float, A> const& lhs, batch<float, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+                }
+                return vextq_f32(rhs, lhs, I);
+            }
+
+            // Entry point of the search: an offset of 0 returns rhs unchanged
+            // without issuing an intrinsic; any other offset is searched for
+            // among the indices that follow 0 in the sequence.
+            template <class A, class T, size_t... Is>
+            inline batch<T, A> extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
+            {
+                if (n != 0)
+                {
+                    return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+                }
+                return rhs;
+            }
+        }
+
+        // extract_pair with a runtime offset: asserts that n is below the lane
+        // count of the batch, then lets detail::extract_pair_impl pick the
+        // compile-time offset equal to n from the sequence 0 .. size-1.
+        template <class A, class T>
+        inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<neon>) noexcept
+        {
+            constexpr std::size_t size = batch<T, A>::size;
+            using lane_offsets = ::xsimd::detail::make_index_sequence<size>;
+            assert(n < size && "index in bounds");
+            return detail::extract_pair_impl(lhs, rhs, n, lane_offsets());
+        }
+
+        /******************
+         * bitwise_lshift *
+         ******************/
+
+        namespace detail
+        {
+            // The vshlq_n_* intrinsics take their shift count as a compile-time
+            // constant, so the runtime count `n` is matched against each
+            // candidate value I of an int_sequence in turn. The overload whose I
+            // equals n issues vshlq_n_*(lhs, I); otherwise the search continues
+            // with the remaining values.
+
+            // End of the search: no candidate value matched n.
+            template <class A, class T>
+            inline batch<T, A> bitwise_lshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept
+            {
+                assert(false && "bitwise_lshift out of bounds");
+                return batch<T, A> {};
+            }
+
+            // Unsigned 8-bit lanes.
+            template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
+            inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+                }
+                return vshlq_n_u8(lhs, I);
+            }
+
+            // Signed 8-bit lanes.
+            template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0>
+            inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+                }
+                return vshlq_n_s8(lhs, I);
+            }
+
+            // Unsigned 16-bit lanes.
+            template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
+            inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+                }
+                return vshlq_n_u16(lhs, I);
+            }
+
+            // Signed 16-bit lanes.
+            template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0>
+            inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+                }
+                return vshlq_n_s16(lhs, I);
+            }
+
+            // Unsigned 32-bit lanes.
+            template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
+            inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+                }
+                return vshlq_n_u32(lhs, I);
+            }
+
+            // Signed 32-bit lanes.
+            template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0>
+            inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+                }
+                return vshlq_n_s32(lhs, I);
+            }
+
+            // Unsigned 64-bit lanes.
+            template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
+            inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+                }
+                return vshlq_n_u64(lhs, I);
+            }
+
+            // Signed 64-bit lanes.
+            template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0>
+            inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+                }
+                return vshlq_n_s64(lhs, I);
+            }
+
+            // Entry point of the search: a count of 0 returns lhs unchanged
+            // without issuing an intrinsic; any other count is searched for
+            // among the values that follow 0 in the sequence.
+            template <class A, class T, int... Is>
+            inline batch<T, A> bitwise_lshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
+            {
+                if (n != 0)
+                {
+                    return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+                }
+                return lhs;
+            }
+        }
+
+        // bitwise_lshift by a runtime scalar count: asserts that n lies in
+        // [0, bit width of the lane type), then lets detail::bitwise_lshift_impl
+        // pick the compile-time count equal to n from the sequence 0 .. size-1.
+        template <class A, class T>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
+        {
+            constexpr int size = sizeof(typename batch<T, A>::value_type) * 8;
+            using shift_counts = ::xsimd::detail::make_int_sequence<size>;
+            assert(0 <= n && n < size && "index in bounds");
+            return detail::bitwise_lshift_impl(lhs, n, shift_counts());
+        }
+
+        // bitwise_lshift with per-lane counts, unsigned 8-bit lanes: forwards
+        // lhs and the signed count batch rhs to vshlq_u8.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const uint8x16_t shifted = vshlq_u8(lhs, rhs);
+            return shifted;
+        }
+
+        // bitwise_lshift with per-lane counts, signed 8-bit lanes: forwards lhs
+        // and the count batch rhs to vshlq_s8.
+        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const int8x16_t shifted = vshlq_s8(lhs, rhs);
+            return shifted;
+        }
+
+        // bitwise_lshift with per-lane counts, unsigned 16-bit lanes: forwards
+        // lhs and the signed count batch rhs to vshlq_u16.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const uint16x8_t shifted = vshlq_u16(lhs, rhs);
+            return shifted;
+        }
+
+        // bitwise_lshift with per-lane counts, signed 16-bit lanes: forwards lhs
+        // and the count batch rhs to vshlq_s16.
+        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const int16x8_t shifted = vshlq_s16(lhs, rhs);
+            return shifted;
+        }
+
+        // bitwise_lshift with per-lane counts, unsigned 32-bit lanes: forwards
+        // lhs and the signed count batch rhs to vshlq_u32.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const uint32x4_t shifted = vshlq_u32(lhs, rhs);
+            return shifted;
+        }
+
+        // bitwise_lshift with per-lane counts, signed 32-bit lanes: forwards lhs
+        // and the count batch rhs to vshlq_s32.
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const int32x4_t shifted = vshlq_s32(lhs, rhs);
+            return shifted;
+        }
+
+        // bitwise_lshift with per-lane counts, unsigned 64-bit lanes: forwards
+        // lhs and the signed count batch rhs to vshlq_u64.
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const uint64x2_t shifted = vshlq_u64(lhs, rhs);
+            return shifted;
+        }
+
+        // bitwise_lshift with per-lane counts, signed 64-bit lanes: forwards lhs
+        // and the count batch rhs to vshlq_s64.
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            const int64x2_t shifted = vshlq_s64(lhs, rhs);
+            return shifted;
+        }
+
+        /******************
+         * bitwise_rshift *
+         ******************/
+
+        namespace detail // helpers turning a runtime shift count n into the constant operand of vshrq_n_*
+        {
+            template <class A, class T>
+            inline batch<T, A> bitwise_rshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept // recursion end: no candidate left
+            {
+                assert(false && "bitwise_rshift out of bounds"); // n matched none of the candidates tried before
+                return batch<T, A> {}; // value-initialised batch, only reached when asserts are disabled
+            }
+
+            template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0> // 8-bit unsigned lanes
+            inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n == I) // runtime count equals the head of the sequence
+                {
+                    return vshrq_n_u8(lhs, I); // shift amount passed as the compile-time constant I
+                }
+                else
+                {
+                    return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); // try the remaining candidates
+                }
+            }
+
+            template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0> // 8-bit signed lanes
+            inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n == I) // runtime count equals the head of the sequence
+                {
+                    return vshrq_n_s8(lhs, I); // shift amount passed as the compile-time constant I
+                }
+                else
+                {
+                    return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); // try the remaining candidates
+                }
+            }
+
+            template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0> // 16-bit unsigned lanes
+            inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n == I) // runtime count equals the head of the sequence
+                {
+                    return vshrq_n_u16(lhs, I); // shift amount passed as the compile-time constant I
+                }
+                else
+                {
+                    return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); // try the remaining candidates
+                }
+            }
+
+            template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0> // 16-bit signed lanes
+            inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n == I) // runtime count equals the head of the sequence
+                {
+                    return vshrq_n_s16(lhs, I); // shift amount passed as the compile-time constant I
+                }
+                else
+                {
+                    return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); // try the remaining candidates
+                }
+            }
+
+            template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0> // 32-bit unsigned lanes
+            inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n == I) // runtime count equals the head of the sequence
+                {
+                    return vshrq_n_u32(lhs, I); // shift amount passed as the compile-time constant I
+                }
+                else
+                {
+                    return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); // try the remaining candidates
+                }
+            }
+
+            template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0> // 32-bit signed lanes
+            inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n == I) // runtime count equals the head of the sequence
+                {
+                    return vshrq_n_s32(lhs, I); // shift amount passed as the compile-time constant I
+                }
+                else
+                {
+                    return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); // try the remaining candidates
+                }
+            }
+
+            template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0> // 64-bit unsigned lanes
+            inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n == I) // runtime count equals the head of the sequence
+                {
+                    return vshrq_n_u64(lhs, I); // shift amount passed as the compile-time constant I
+                }
+                else
+                {
+                    return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); // try the remaining candidates
+                }
+            }
+
+            template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0> // 64-bit signed lanes
+            inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+            {
+                if (n == I) // runtime count equals the head of the sequence
+                {
+                    return vshrq_n_s64(lhs, I); // shift amount passed as the compile-time constant I
+                }
+                else
+                {
+                    return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); // try the remaining candidates
+                }
+            }
+
+            template <class A, class T, int... Is>
+            inline batch<T, A> bitwise_rshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept // entry point: peels the leading 0 off the sequence
+            {
+                if (n == 0) // NOTE(review): presumably handled apart because vshrq_n_* rejects a zero immediate - confirm
+                {
+                    return lhs; // shifting by zero leaves the batch unchanged
+                }
+                else
+                {
+                    return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); // dispatch over the remaining candidates
+                }
+            }
+        }
+
+        template <class A, class T>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept // right shift of every lane by the same runtime count n
+        {
+            constexpr int size = sizeof(typename batch<T, A>::value_type) * 8; // lane width in bits
+            assert(0 <= n && n < size && "index in bounds"); // debug-only precondition: 0 <= n < lane width
+            return detail::bitwise_rshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>()); // sequence of candidate counts; the impl overload requires it to start at 0
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> // SFINAE: 8-bit unsigned lanes
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept // counts come in the signed counterpart of T
+        {
+            return vshlq_u8(lhs, vnegq_s8(rhs)); // right shift expressed as a left shift by the negated per-lane count
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> // SFINAE: 8-bit signed lanes
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            return vshlq_s8(lhs, vnegq_s8(rhs)); // right shift expressed as a left shift by the negated per-lane count
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> // SFINAE: 16-bit unsigned lanes
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept // counts come in the signed counterpart of T
+        {
+            return vshlq_u16(lhs, vnegq_s16(rhs)); // right shift expressed as a left shift by the negated per-lane count
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> // SFINAE: 16-bit signed lanes
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            return vshlq_s16(lhs, vnegq_s16(rhs)); // right shift expressed as a left shift by the negated per-lane count
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> // SFINAE: 32-bit unsigned lanes
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept // counts come in the signed counterpart of T
+        {
+            return vshlq_u32(lhs, vnegq_s32(rhs)); // right shift expressed as a left shift by the negated per-lane count
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> // SFINAE: 32-bit signed lanes
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            return vshlq_s32(lhs, vnegq_s32(rhs)); // right shift expressed as a left shift by the negated per-lane count
+        }
+
+        // Overloads of bitwise_rshift taking two batches of uint64/int64 are not provided here: negating the shift count needs vnegq_s64, which ARMv7 lacks
+
+        /*******
+         * all *
+         *******/
+
+        template <class A, class T, detail::enable_sized_t<T, 8> = 0> // 8-byte lanes: the mask holds two 64-bit lanes
+        inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            const uint64x1_t both_lanes = vand_u64(vget_low_u64(arg), vget_high_u64(arg)); // AND of the low and high 64-bit lanes
+            return ~0ULL == vget_lane_u64(both_lanes, 0); // all bits set only if both lanes were all-ones
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 1> = 0> // 1-byte lanes
+        inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {}); // view the mask as 64-bit lanes and reuse the 8-byte overload
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 2> = 0> // 2-byte lanes
+        inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {}); // view the mask as 64-bit lanes and reuse the 8-byte overload
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 4> = 0> // 4-byte lanes
+        inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {}); // view the mask as 64-bit lanes and reuse the 8-byte overload
+        }
+
+        /*******
+         * any *
+         *******/
+
+        template <class A, class T, detail::enable_sized_t<T, 8> = 0> // 8-byte lanes: the mask holds two 64-bit lanes
+        inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            const uint32x2_t narrowed = vqmovn_u64(arg); // saturating narrow: a non-zero 64-bit lane stays non-zero as 32 bits
+            return 0 != vget_lane_u64(vreinterpret_u64_u32(narrowed), 0); // both narrowed lanes tested at once as one 64-bit value
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 1> = 0> // 1-byte lanes
+        inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {}); // view the mask as 64-bit lanes and reuse the 8-byte overload
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 2> = 0> // 2-byte lanes
+        inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {}); // view the mask as 64-bit lanes and reuse the 8-byte overload
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 4> = 0> // 4-byte lanes
+        inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+        {
+            return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {}); // view the mask as 64-bit lanes and reuse the 8-byte overload
+        }
+
+        /****************
+         * bitwise_cast *
+         ****************/
+
+#define WRAP_CAST(SUFFIX, TYPE)                                          \
+    namespace wrap                                                       \
+    {                                                                    \
+        inline TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) noexcept   \
+        {                                                                \
+            return ::vreinterpretq_##SUFFIX##_u8(a);                     \
+        }                                                                \
+        inline TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) noexcept    \
+        {                                                                \
+            return ::vreinterpretq_##SUFFIX##_s8(a);                     \
+        }                                                                \
+        inline TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) noexcept  \
+        {                                                                \
+            return ::vreinterpretq_##SUFFIX##_u16(a);                    \
+        }                                                                \
+        inline TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) noexcept   \
+        {                                                                \
+            return ::vreinterpretq_##SUFFIX##_s16(a);                    \
+        }                                                                \
+        inline TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) noexcept  \
+        {                                                                \
+            return ::vreinterpretq_##SUFFIX##_u32(a);                    \
+        }                                                                \
+        inline TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) noexcept   \
+        {                                                                \
+            return ::vreinterpretq_##SUFFIX##_s32(a);                    \
+        }                                                                \
+        inline TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) noexcept  \
+        {                                                                \
+            return ::vreinterpretq_##SUFFIX##_u64(a);                    \
+        }                                                                \
+        inline TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) noexcept   \
+        {                                                                \
+            return ::vreinterpretq_##SUFFIX##_s64(a);                    \
+        }                                                                \
+        inline TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) noexcept \
+        {                                                                \
+            return ::vreinterpretq_##SUFFIX##_f32(a);                    \
+        }                                                                \
+    }
+
+        WRAP_CAST(u8, uint8x16_t) // each use emits nine wrap::vreinterpretq_<SUFFIX>_<src> functions forwarding to the global-scope names
+        WRAP_CAST(s8, int8x16_t) // one use per destination register type; bitwise_cast takes these wrappers by address
+        WRAP_CAST(u16, uint16x8_t)
+        WRAP_CAST(s16, int16x8_t)
+        WRAP_CAST(u32, uint32x4_t)
+        WRAP_CAST(s32, int32x4_t)
+        WRAP_CAST(u64, uint64x2_t)
+        WRAP_CAST(s64, int64x2_t)
+        WRAP_CAST(f32, float32x4_t)
+
+#undef WRAP_CAST // generator macro is local to this section
+
+        namespace detail // type-indexed lookup tables of reinterpret functions used by bitwise_cast
+        {
+            template <class R, class... T>
+            struct bitwise_caster_impl // all casts producing R, one function pointer per source type in T...
+            {
+                using container_type = std::tuple<R (*)(T)...>;
+                container_type m_func;
+
+                template <class U>
+                R apply(U rhs) const noexcept // cast rhs (source type U) to R
+                {
+                    using func_type = R (*)(U);
+                    auto func = xsimd::detail::get<func_type>(m_func); // tuple element selected by its type R(*)(U)
+                    return func(rhs);
+                }
+            };
+
+            template <class R, class... T>
+            inline const bitwise_caster_impl<R, T...> make_bitwise_caster_impl(R (*... arg)(T)) noexcept // deduces R and T... from the function pointers passed in
+            {
+                return { std::make_tuple(arg...) };
+            }
+
+            template <class... T>
+            struct type_list // empty tag type carrying a pack of types
+            {
+            };
+
+            template <class RTL, class TTL>
+            struct bitwise_caster; // primary template, only the type_list/type_list specialisation is defined
+
+            template <class... R, class... T>
+            struct bitwise_caster<type_list<R...>, type_list<T...>> // one bitwise_caster_impl per destination type in R...
+            {
+                using container_type = std::tuple<bitwise_caster_impl<R, T...>...>;
+                container_type m_caster;
+
+                template <class V, class U>
+                V apply(U rhs) const noexcept // cast rhs (source type U) to destination type V
+                {
+                    using caster_type = bitwise_caster_impl<V, T...>;
+                    auto caster = xsimd::detail::get<caster_type>(m_caster); // row for destination V, selected by type
+                    return caster.apply(rhs); // row then picks the function matching U
+                }
+            };
+
+            template <class... T>
+            using bitwise_caster_t = bitwise_caster<type_list<T...>, type_list<T...>>; // same list used for destinations and sources
+
+            using neon_bitwise_caster = bitwise_caster_t<uint8x16_t, int8x16_t, // the nine 128-bit register types covered by WRAP_CAST
+                                                         uint16x8_t, int16x8_t,
+                                                         uint32x4_t, int32x4_t,
+                                                         uint64x2_t, int64x2_t,
+                                                         float32x4_t>;
+        }
+
+        template <class A, class T, class R>
+        inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<neon>) noexcept // second parameter is unnamed: only its type R is used
+        {
+            const detail::neon_bitwise_caster caster = { // 9 x 9 table: one row per destination type, one entry per source type
+                std::make_tuple(
+                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16,
+                                                     wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64,
+                                                     wrap::vreinterpretq_u8_f32),
+                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16,
+                                                     wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64,
+                                                     wrap::vreinterpretq_s8_f32),
+                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16,
+                                                     wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64,
+                                                     wrap::vreinterpretq_u16_f32),
+                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16,
+                                                     wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64,
+                                                     wrap::vreinterpretq_s16_f32),
+                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16,
+                                                     wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64,
+                                                     wrap::vreinterpretq_u32_f32),
+                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16,
+                                                     wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64,
+                                                     wrap::vreinterpretq_s32_f32),
+                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16,
+                                                     wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64,
+                                                     wrap::vreinterpretq_u64_f32),
+                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16,
+                                                     wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64,
+                                                     wrap::vreinterpretq_s64_f32),
+                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16,
+                                                     wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64,
+                                                     wrap::vreinterpretq_f32_f32))
+            };
+            using src_register_type = typename batch<T, A>::register_type; // register type holding the source lanes
+            using dst_register_type = typename batch<R, A>::register_type; // register type holding the destination lanes
+            return caster.apply<dst_register_type>(src_register_type(arg)); // entry selected purely by the two register types
+        }
+
+        /*********
+         * isnan *
+         *********/
+
+        template <class A>
+        inline batch_bool<float, A> isnan(batch<float, A> const& arg, requires_arch<neon>) noexcept // per-lane NaN test for float batches
+        {
+            return !(arg == arg); // mask set in lanes that do not compare equal to themselves
+        }
+
+        // slide_left
+        namespace detail // functors behind slide_left, selected by the byte count N
+        {
+            template <size_t N>
+            struct slider_left
+            {
+                template <class A, class T>
+                inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
+                {
+                    const auto left = vdupq_n_u8(0); // sixteen zero bytes
+                    const auto right = bitwise_cast<uint8_t>(x).data; // x viewed as sixteen bytes
+                    const batch<uint8_t, A> res(vextq_u8(left, right, 16 - N)); // 16 bytes of left:right from offset 16 - N: N zero bytes, then the first 16 - N bytes of x
+                    return bitwise_cast<T>(res); // back to the caller's lane type
+                }
+            };
+
+            template <>
+            struct slider_left<0> // N == 0 handled apart: the generic path would pass 16 to vextq_u8
+            {
+                template <class A, class T>
+                inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
+                {
+                    return x; // sliding by zero bytes is the identity
+                }
+            };
+        } // namespace detail
+
+        template <size_t N, class A, class T> // N is a byte count, fixed at compile time
+        inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return detail::slider_left<N> {}(x, A {}); // functor form so that N == 0 can be specialised
+        }
+
+        // slide_right
+        namespace detail // functors behind slide_right, selected by the byte count N
+        {
+            template <size_t N>
+            struct slider_right
+            {
+                template <class A, class T>
+                inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
+                {
+                    const auto left = bitwise_cast<uint8_t>(x).data; // x viewed as sixteen bytes
+                    const auto right = vdupq_n_u8(0); // sixteen zero bytes
+                    const batch<uint8_t, A> res(vextq_u8(left, right, N)); // 16 bytes of left:right from offset N: bytes N..15 of x, then N zero bytes
+                    return bitwise_cast<T>(res); // back to the caller's lane type
+                }
+            };
+
+            template <>
+            struct slider_right<16> // N == 16 handled apart: the generic path would pass 16 to vextq_u8
+            {
+                template <class A, class T>
+                inline batch<T, A> operator()(batch<T, A> const&, requires_arch<neon>) noexcept
+                {
+                    return batch<T, A> {}; // value-initialised batch; input ignored since every byte is slid out
+                }
+            };
+        } // namespace detail
+
+        template <size_t N, class A, class T> // N is a byte count, fixed at compile time
+        inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<neon>) noexcept
+        {
+            return detail::slider_right<N> {}(x, A {}); // functor form so that N == 16 can be specialised
+        }
+    }
+
+    template <class batch_type, typename batch_type::value_type... Values>
+    struct batch_constant; // forward declaration only; used as a parameter type by kernel::swizzle below
+
+    namespace kernel
+    {
+        /***********
+         * swizzle *
+         ***********/
+
+        template <class A, class T, class I, I... idx> // idx...: compile-time source lane for each output lane
+        inline batch<T, A> swizzle(batch<T, A> const& self,
+                                   batch_constant<batch<I, A>, idx...>,
+                                   requires_arch<neon>) noexcept
+        {
+            static_assert(batch<T, A>::size == sizeof...(idx), "valid swizzle indices"); // exactly one index per lane
+            std::array<T, batch<T, A>::size> data; // scratch copy of the lanes; only guaranteed alignof(T)
+            self.store_unaligned(data.data()); // unaligned store: data carries no register-alignment guarantee
+            return set(batch<T, A>(), A(), data[idx]...); // rebuild the batch, output lane k taking data[idx_k]
+        }
+    }
+}
+
+#undef WRAP_BINARY_INT_EXCLUDING_64
+#undef WRAP_BINARY_INT
+#undef WRAP_BINARY_FLOAT
+#undef WRAP_UNARY_INT_EXCLUDING_64
+#undef WRAP_UNARY_INT
+#undef WRAP_UNARY_FLOAT
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp
new file mode 100644
index 0000000000..31ab6210bd
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp
@@ -0,0 +1,1322 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_NEON64_HPP
+#define XSIMD_NEON64_HPP
+
+#include <complex>
+#include <cstddef>
+#include <tuple>
+
+#include "../types/xsimd_neon64_register.hpp"
+#include "../types/xsimd_utils.hpp"
+
+namespace xsimd
+{
+    template <class batch_type, bool... Values>
+    struct batch_bool_constant; // forward declaration only (no definition at this point)
+
+    namespace kernel
+    {
+        using namespace types;
+
+        /*******
+         * all *
+         *******/
+
+        template <class A, class T, detail::enable_sized_t<T, 4> = 0> // 4-byte lanes: base case for neon64
+        inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+        {
+            return vminvq_u32(arg) == ~0U; // smallest lane is all-ones only if every lane is all-ones
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 1> = 0> // 1-byte lanes
+        inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+        {
+            return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {}); // view the mask as 32-bit lanes and reuse the 4-byte overload
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 2> = 0> // 2-byte lanes
+        inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+        {
+            return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {}); // view the mask as 32-bit lanes and reuse the 4-byte overload
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 8> = 0> // 8-byte lanes
+        inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+        {
+            return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {}); // view the mask as 32-bit lanes and reuse the 4-byte overload
+        }
+
+        /*******
+         * any *
+         *******/
+
+        template <class A, class T, detail::enable_sized_t<T, 4> = 0> // 4-byte lanes: base case for neon64
+        inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+        {
+            return vmaxvq_u32(arg) != 0; // largest lane is non-zero if at least one lane has a bit set
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 1> = 0> // 1-byte lanes
+        inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+        {
+            return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {}); // view the mask as 32-bit lanes and reuse the 4-byte overload
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 2> = 0> // 2-byte lanes
+        inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+        {
+            return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {}); // view the mask as 32-bit lanes and reuse the 4-byte overload
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 8> = 0> // 8-byte lanes
+        inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+        {
+            return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {}); // view the mask as 32-bit lanes and reuse the 4-byte overload
+        }
+
+        /*************
+         * broadcast *
+         *************/
+
+        // Required to avoid ambiguous call
+        template <class A, class T>
+        inline batch<T, A> broadcast(T val, requires_arch<neon64>) noexcept // generic T: no neon64-specific code, just forwarding
+        {
+            return broadcast<neon64>(val, neon {}); // explicitly pick the neon implementation, keeping neon64 as the batch architecture
+        }
+
+        template <class A>
+        inline batch<double, A> broadcast(double val, requires_arch<neon64>) noexcept // double batches are handled by the neon64 kernels
+        {
+            return vdupq_n_f64(val); // val duplicated into both double lanes
+        }
+
+        /*******
+         * set *
+         *******/
+
+        template <class A>
+        inline batch<double, A> set(batch<double, A> const&, requires_arch<neon64>, double d0, double d1) noexcept // first parameter is an unnamed tag selecting the batch type
+        {
+            return float64x2_t { d0, d1 }; // lane 0 = d0, lane 1 = d1
+        }
+
+        template <class A>
+        inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<neon64>, bool b0, bool b1) noexcept // first parameter is an unnamed tag selecting the mask type
+        {
+            using lane_mask = as_unsigned_integer_t<double>; // integer type of one mask lane
+            using mask_register = typename batch_bool<double, A>::register_type;
+            return mask_register { b0 ? static_cast<lane_mask>(-1LL) : static_cast<lane_mask>(0LL), // true -> all bits set, false -> zero
+                                   b1 ? static_cast<lane_mask>(-1LL) : static_cast<lane_mask>(0LL) };
+        }
+
+        /*************
+         * from_bool *
+         *************/
+
+        template <class A>
+        inline batch<double, A> from_bool(batch_bool<double, A> const& arg, requires_arch<neon64>) noexcept // turns a lane mask into a batch of doubles
+        {
+            return vreinterpretq_f64_u64(vandq_u64(arg, vreinterpretq_u64_f64(vdupq_n_f64(1.)))); // mask AND bit pattern of 1.0: all-ones lanes give 1.0, zero lanes give 0.0
+        }
+
+        /********
+         * load *
+         ********/
+#if defined(__clang__) || defined(__GNUC__) // GCC/Clang: annotate the pointer as 16-byte aligned
+#define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
+#elif defined(_MSC_VER) // MSVC: use the _ex form of the intrinsic, which takes an extra alignment argument
+#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
+#else // other compilers: plain load without any alignment hint
+#define xsimd_aligned_load(inst, type, expr) inst((type)expr)
+#endif
+
+        template <class A>
+        inline batch<double, A> load_aligned(double const* src, convert<double>, requires_arch<neon64>) noexcept // loads two doubles from src
+        {
+            return xsimd_aligned_load(vld1q_f64, double*, src); // vld1q_f64 through the compiler-specific alignment wrapper defined above
+        }
+
+        template <class A>
+        inline batch<double, A> load_unaligned(double const* src, convert<double>, requires_arch<neon64>) noexcept // loads two doubles from src
+        {
+            return vld1q_f64(src); // plain load, no alignment hint
+        }
+#undef xsimd_aligned_load
+
+        /*********
+         * store *
+         *********/
+
+        template <class A>
+        inline void store_aligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept // writes both double lanes of src to dst
+        {
+            vst1q_f64(dst, src); // no alignment hint is passed, unlike load_aligned
+        }
+
+        template <class A>
+        inline void store_unaligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept // writes both double lanes of src to dst
+        {
+            return store_aligned<A>(dst, src, A {}); // same code path as the aligned store
+        }
+
+        /****************
+         * load_complex *
+         ****************/
+
+        template <class A>
+        inline batch<std::complex<double>, A> load_complex_aligned(std::complex<double> const* mem, convert<std::complex<double>>, requires_arch<neon64>) noexcept // loads two complex values from mem
+        {
+            using component_batch = batch<double, A>;
+            const double* interleaved = reinterpret_cast<const double*>(mem); // complex array read as a flat run of doubles
+            float64x2x2_t planes = vld2q_f64(interleaved); // de-interleaving load into two registers
+            component_batch re = planes.val[0]; // first register is used as the real parts
+            component_batch im = planes.val[1]; // second register is used as the imaginary parts
+            return batch<std::complex<double>, A> { re, im };
+        }
+
+        template <class A>
+        inline batch<std::complex<double>, A> load_complex_unaligned(std::complex<double> const* mem, convert<std::complex<double>> cvt, requires_arch<neon64>) noexcept // loads two complex values from mem
+        {
+            return load_complex_aligned<A>(mem, cvt, A {}); // same code path as the aligned load
+        }
+
+        /*****************
+         * store_complex *
+         *****************/
+
+        template <class A>
+        inline void store_complex_aligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept // writes the two complex values of src to dst
+        {
+            double* out = reinterpret_cast<double*>(dst); // complex array written as a flat run of doubles
+            float64x2x2_t planes;
+            planes.val[0] = src.real(); // first register: real parts
+            planes.val[1] = src.imag(); // second register: imaginary parts
+            vst2q_f64(out, planes); // interleaving store of the two registers
+        }
+
+        template <class A>
+        inline void store_complex_unaligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept // writes the two complex values of src to dst
+        {
+            store_complex_aligned(dst, src, A {}); // same code path as the aligned store
+        }
+
+        /*******
+         * neg *
+         *******/
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> // SFINAE: 64-bit unsigned lanes
+        inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vreinterpretq_u64_s64(vnegq_s64(vreinterpretq_s64_u64(rhs))); // negate through the signed view, then reinterpret back to unsigned
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> // SFINAE: 64-bit signed lanes
+        inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vnegq_s64(rhs); // lane-wise negation
+        }
+
+        template <class A>
+        inline batch<double, A> neg(batch<double, A> const& rhs, requires_arch<neon64>) noexcept // double lanes
+        {
+            return vnegq_f64(rhs); // lane-wise negation
+        }
+
+        /*******
+         * add *
+         *******/
+
+        template <class A>
+        inline batch<double, A> add(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept // double lanes
+        {
+            return vaddq_f64(lhs, rhs); // lane-wise lhs + rhs
+        }
+
+        /********
+         * sadd *
+         ********/
+
+        template <class A>
+        inline batch<double, A> sadd(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return add(lhs, rhs, neon64 {}); // for double, sadd forwards to add(); no extra clamping is applied here
+        }
+
+        /*******
+         * sub *
+         *******/
+
+        template <class A>
+        inline batch<double, A> sub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vsubq_f64(lhs, rhs); // lane-wise lhs - rhs on the two double lanes
+        }
+
+        /********
+         * ssub *
+         ********/
+
+        template <class A>
+        inline batch<double, A> ssub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return sub(lhs, rhs, neon64 {}); // for double, ssub forwards to sub(); no extra clamping is applied here
+        }
+
+        /*******
+         * mul *
+         *******/
+
+        template <class A>
+        inline batch<double, A> mul(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vmulq_f64(lhs, rhs); // lane-wise lhs * rhs on the two double lanes
+        }
+
+        /*******
+         * div *
+         *******/
+
+#if defined(XSIMD_FAST_INTEGER_DIVISION) // opt-in: 64-bit integer division is routed through double precision
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcvtq_u64_f64(vdivq_f64(vcvtq_f64_u64(lhs), vcvtq_f64_u64(rhs))); // u64 -> f64, divide, truncate back; operands above 2^53 are not exact in double
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcvtq_s64_f64(vdivq_f64(vcvtq_f64_s64(lhs), vcvtq_f64_s64(rhs))); // s64 -> f64, divide, truncate back; magnitudes above 2^53 are not exact in double
+        }
+#endif
+        template <class A>
+        inline batch<double, A> div(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vdivq_f64(lhs, rhs); // lane-wise lhs / rhs on the two double lanes
+        }
+
+        /******
+         * eq *
+         ******/
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vceqq_u64(lhs, rhs); // per-lane mask: all bits set where lhs == rhs, zero elsewhere
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vceqq_s64(lhs, rhs); // per-lane mask: all bits set where lhs == rhs, zero elsewhere
+        }
+
+        template <class A>
+        inline batch_bool<double, A> eq(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vceqq_f64(lhs, rhs); // per-lane floating-point equality, returned as a 64-bit lane mask
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vceqq_u64(lhs, rhs); // compares the two mask registers lane by lane
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vceqq_u64(lhs, rhs); // u64 compare even for signed T; NOTE(review): presumably the mask register is uint64x2_t - confirm
+        }
+
+        template <class A>
+        inline batch_bool<double, A> eq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vceqq_u64(lhs, rhs); // masks are compared as 64-bit integer lanes, not as doubles
+        }
+
+        /*************
+         * fast_cast *
+         *************/
+        namespace detail // 64-bit integer <-> double conversions; the unnamed second parameter only selects the overload
+        {
+            template <class A>
+            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
+            {
+                return vcvtq_f64_s64(x); // signed 64-bit integer lanes -> double lanes
+            }
+
+            template <class A>
+            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
+            {
+                return vcvtq_f64_u64(x); // unsigned 64-bit integer lanes -> double lanes
+            }
+
+            template <class A>
+            inline batch<int64_t, A> fast_cast(batch<double, A> const& x, batch<int64_t, A> const&, requires_arch<neon64>) noexcept
+            {
+                return vcvtq_s64_f64(x); // double lanes -> signed 64-bit integers, rounding toward zero
+            }
+
+            template <class A>
+            inline batch<uint64_t, A> fast_cast(batch<double, A> const& x, batch<uint64_t, A> const&, requires_arch<neon64>) noexcept
+            {
+                return vcvtq_u64_f64(x); // double lanes -> unsigned 64-bit integers, rounding toward zero
+            }
+
+        }
+
+        /******
+         * lt *
+         ******/
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcltq_u64(lhs, rhs); // per-lane mask of lhs < rhs, unsigned comparison
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcltq_s64(lhs, rhs); // per-lane mask of lhs < rhs, signed comparison
+        }
+
+        template <class A>
+        inline batch_bool<double, A> lt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcltq_f64(lhs, rhs); // per-lane mask of lhs < rhs, floating-point comparison
+        }
+
+        /******
+         * le *
+         ******/
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcleq_u64(lhs, rhs); // per-lane mask of lhs <= rhs, unsigned comparison
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcleq_s64(lhs, rhs); // per-lane mask of lhs <= rhs, signed comparison
+        }
+
+        template <class A>
+        inline batch_bool<double, A> le(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcleq_f64(lhs, rhs); // per-lane mask of lhs <= rhs, floating-point comparison
+        }
+
+        /******
+         * gt *
+         ******/
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcgtq_u64(lhs, rhs); // per-lane mask of lhs > rhs, unsigned comparison
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcgtq_s64(lhs, rhs); // per-lane mask of lhs > rhs, signed comparison
+        }
+
+        template <class A>
+        inline batch_bool<double, A> gt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcgtq_f64(lhs, rhs); // per-lane mask of lhs > rhs, floating-point comparison
+        }
+
+        /******
+         * ge *
+         ******/
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcgeq_u64(lhs, rhs); // per-lane mask of lhs >= rhs, unsigned comparison
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcgeq_s64(lhs, rhs); // per-lane mask of lhs >= rhs, signed comparison
+        }
+
+        template <class A>
+        inline batch_bool<double, A> ge(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vcgeq_f64(lhs, rhs); // per-lane mask of lhs >= rhs, floating-point comparison
+        }
+
+        /*******************
+         * batch_bool_cast *
+         *******************/
+
+        template <class A, class T_out, class T_in>
+        inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon64>) noexcept
+        {
+            // Functional cast of the source mask to the destination mask's native register type.
+            return typename batch_bool<T_out, A>::register_type(self);
+        }
+
+        /***************
+         * bitwise_and *
+         ***************/
+
+        template <class A>
+        inline batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(lhs), // AND is computed on the u64 bit pattern,
+                                                   vreinterpretq_u64_f64(rhs))); // then viewed as double again
+        }
+
+        template <class A>
+        inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vandq_u64(lhs, rhs); // AND of the two mask registers
+        }
+
+        /**************
+         * bitwise_or *
+         **************/
+
+        template <class A>
+        inline batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(lhs), // OR is computed on the u64 bit pattern,
+                                                   vreinterpretq_u64_f64(rhs))); // then viewed as double again
+        }
+
+        template <class A>
+        inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vorrq_u64(lhs, rhs); // OR of the two mask registers
+        }
+
+        /***************
+         * bitwise_xor *
+         ***************/
+
+        template <class A>
+        inline batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(lhs), // XOR is computed on the u64 bit pattern,
+                                                   vreinterpretq_u64_f64(rhs))); // then viewed as double again
+        }
+
+        template <class A>
+        inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return veorq_u64(lhs, rhs); // XOR of the two mask registers
+        }
+
+        /*******
+         * neq *
+         *******/
+
+        template <class A>
+        inline batch_bool<double, A> neq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return bitwise_xor(lhs, rhs, A {}); // for masks, "differs" is computed as XOR of the two operands
+        }
+
+        /***************
+         * bitwise_not *
+         ***************/
+
+        template <class A>
+        inline batch<double, A> bitwise_not(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_f64(rhs))); // inverts every bit through the u32 view; lane width does not matter for NOT
+        }
+
+        template <class A>
+        inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return detail::bitwise_not_u64(rhs); // delegates to detail::bitwise_not_u64, which is defined outside this section
+        }
+
+        /******************
+         * bitwise_andnot *
+         ******************/
+
+        template <class A>
+        inline batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(lhs), // bit clear: lhs & ~rhs on the u64 bit pattern,
+                                                   vreinterpretq_u64_f64(rhs))); // then viewed as double again
+        }
+
+        template <class A>
+        inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vbicq_u64(lhs, rhs); // bit clear on the mask registers: lhs & ~rhs
+        }
+
+        /*******
+         * min *
+         *******/
+
+        template <class A>
+        inline batch<double, A> min(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vminq_f64(lhs, rhs); // lane-wise minimum of the two double batches
+        }
+
+        /*******
+         * max *
+         *******/
+
+        template <class A>
+        inline batch<double, A> max(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vmaxq_f64(lhs, rhs); // lane-wise maximum of the two double batches
+        }
+
+        /*******
+         * abs *
+         *******/
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return rhs; // unsigned lanes are returned unchanged
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vabsq_s64(rhs); // absolute value of each signed 64-bit lane
+        }
+
+        template <class A>
+        inline batch<double, A> abs(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vabsq_f64(rhs); // absolute value of each double lane
+        }
+
+        template <class A>
+        inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+                                                  requires_arch<neon64>) noexcept
+        {
+            return vcvtnq_s32_f32(self); // float -> int32 conversion rounding to nearest, ties to even
+        }
+
+#if !defined(__GNUC__) // NOTE(review): compiled only when __GNUC__ is undefined; the reason is not visible in this section
+        template <class A>
+        inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
+                                                  requires_arch<neon64>) noexcept
+        {
+            return vcvtnq_s64_f64(self); // double -> int64 conversion rounding to nearest, ties to even
+        }
+#endif
+
+        /**************
+         * reciprocal *
+         **************/
+
+        template <class A>
+        inline batch<double, A>
+        reciprocal(const batch<double, A>& x,
+                   kernel::requires_arch<neon64>) noexcept
+        {
+            return vrecpeq_f64(x); // vrecpeq_f64 is a reciprocal *estimate*: an approximation of 1/x, not an exact division
+        }
+
+        /*********
+         * rsqrt *
+         *********/
+
+        template <class A>
+        inline batch<double, A> rsqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vrsqrteq_f64(rhs); // vrsqrteq_f64 is a reciprocal square-root *estimate*: an approximation of 1/sqrt(rhs)
+        }
+
+        /********
+         * sqrt *
+         ********/
+
+        template <class A>
+        inline batch<double, A> sqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vsqrtq_f64(rhs); // square root of each double lane
+        }
+
+        /********************
+         * Fused operations *
+         ********************/
+
+#ifdef __ARM_FEATURE_FMA
+        template <class A>
+        inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
+        {
+            return vfmaq_f64(z, x, y); // the accumulator comes first in vfmaq_f64: result is z + x * y
+        }
+
+        template <class A>
+        inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
+        {
+            return vfmaq_f64(-z, x, y); // negated accumulator: result is x * y - z
+        }
+#endif
+
+        /*********
+         * haddp *
+         *********/
+
+        template <class A>
+        inline batch<double, A> haddp(const batch<double, A>* row, requires_arch<neon64>) noexcept
+        {
+            return vpaddq_f64(row[0], row[1]); // pairwise add: lane 0 = sum of row[0]'s lanes, lane 1 = sum of row[1]'s lanes; reads exactly two rows
+        }
+
+        /**********
+         * insert *
+         **********/
+
+        template <class A, size_t I>
+        inline batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<neon64>) noexcept
+        {
+            return vsetq_lane_f64(val, self, I); // copy of self with lane I replaced by val; I is a compile-time lane index
+        }
+
+        /******************
+         * reducer macros *
+         ******************/
+
+        // Wrap reducer intrinsics in real functions so they can be passed as function pointers
+        // - OP: intrinsics name prefix, e.g., vaddvq
+
+#define WRAP_REDUCER_INT_EXCLUDING_64(OP)               \
+    namespace wrap /* 8-, 16- and 32-bit lanes only */  \
+    {                                                   \
+        inline uint8_t OP##_u8(uint8x16_t a) noexcept   \
+        {                                               \
+            return ::OP##_u8(a);                        \
+        }                                               \
+        inline int8_t OP##_s8(int8x16_t a) noexcept     \
+        {                                               \
+            return ::OP##_s8(a);                        \
+        }                                               \
+        inline uint16_t OP##_u16(uint16x8_t a) noexcept \
+        {                                               \
+            return ::OP##_u16(a);                       \
+        }                                               \
+        inline int16_t OP##_s16(int16x8_t a) noexcept   \
+        {                                               \
+            return ::OP##_s16(a);                       \
+        }                                               \
+        inline uint32_t OP##_u32(uint32x4_t a) noexcept \
+        {                                               \
+            return ::OP##_u32(a);                       \
+        }                                               \
+        inline int32_t OP##_s32(int32x4_t a) noexcept   \
+        {                                               \
+            return ::OP##_s32(a);                       \
+        }                                               \
+    }
+
+#define WRAP_REDUCER_INT(OP)                            \
+    WRAP_REDUCER_INT_EXCLUDING_64(OP)                   \
+    namespace wrap /* adds the 64-bit wrappers */       \
+    {                                                   \
+        inline uint64_t OP##_u64(uint64x2_t a) noexcept \
+        {                                               \
+            return ::OP##_u64(a);                       \
+        }                                               \
+        inline int64_t OP##_s64(int64x2_t a) noexcept   \
+        {                                               \
+            return ::OP##_s64(a);                       \
+        }                                               \
+    }
+
+#define WRAP_REDUCER_FLOAT(OP)                         \
+    namespace wrap /* f32 and f64 wrappers */          \
+    {                                                  \
+        inline float OP##_f32(float32x4_t a) noexcept  \
+        {                                              \
+            return ::OP##_f32(a);                      \
+        }                                              \
+        inline double OP##_f64(float64x2_t a) noexcept \
+        {                                              \
+            return ::OP##_f64(a);                      \
+        }                                              \
+    }
+
+        namespace detail // type plumbing shared by reduce_add / reduce_max / reduce_min
+        {
+            template <class R>
+            struct reducer_return_type_impl; // primary template is only declared; each supported register type is specialized below
+
+            template <>
+            struct reducer_return_type_impl<uint8x16_t>
+            {
+                using type = uint8_t; // scalar type produced when a uint8x16_t register is reduced
+            };
+
+            template <>
+            struct reducer_return_type_impl<int8x16_t>
+            {
+                using type = int8_t;
+            };
+
+            template <>
+            struct reducer_return_type_impl<uint16x8_t>
+            {
+                using type = uint16_t;
+            };
+
+            template <>
+            struct reducer_return_type_impl<int16x8_t>
+            {
+                using type = int16_t;
+            };
+
+            template <>
+            struct reducer_return_type_impl<uint32x4_t>
+            {
+                using type = uint32_t;
+            };
+
+            template <>
+            struct reducer_return_type_impl<int32x4_t>
+            {
+                using type = int32_t;
+            };
+
+            template <>
+            struct reducer_return_type_impl<uint64x2_t>
+            {
+                using type = uint64_t;
+            };
+
+            template <>
+            struct reducer_return_type_impl<int64x2_t>
+            {
+                using type = int64_t;
+            };
+
+            template <>
+            struct reducer_return_type_impl<float32x4_t>
+            {
+                using type = float;
+            };
+
+            template <>
+            struct reducer_return_type_impl<float64x2_t>
+            {
+                using type = double;
+            };
+
+            template <class R>
+            using reducer_return_type = typename reducer_return_type_impl<R>::type; // shorthand: register type R -> its scalar element type
+
+            template <class... T>
+            struct neon_reducer_dispatcher_impl : neon_dispatcher_base<reducer_return_type, T...> // adds no members; passes reducer_return_type to the base
+            {
+            };
+
+            using neon_reducer_dispatcher = neon_reducer_dispatcher_impl<uint8x16_t, int8x16_t, // register order must match the tuples built in reduce_*
+                                                                         uint16x8_t, int16x8_t,
+                                                                         uint32x4_t, int32x4_t,
+                                                                         uint64x2_t, int64x2_t,
+                                                                         float32x4_t, float64x2_t>;
+            template <class T>
+            using enable_neon64_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, // SFINAE gate: integral T, float or double
+                                                                 int>::type;
+        }
+
+        /**************
+         * reduce_add *
+         **************/
+
+        WRAP_REDUCER_INT(vaddvq)
+        WRAP_REDUCER_FLOAT(vaddvq)
+
+        template <class A, class T, detail::enable_neon64_type_t<T> = 0>
+        inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon64>) noexcept
+        {
+            // One vaddvq wrapper per register type, in the order the dispatcher lists its types.
+            const detail::neon_reducer_dispatcher::unary sum_table = {
+                std::make_tuple(wrap::vaddvq_u8, wrap::vaddvq_s8, wrap::vaddvq_u16, wrap::vaddvq_s16, wrap::vaddvq_u32, wrap::vaddvq_s32,
+                                wrap::vaddvq_u64, wrap::vaddvq_s64, wrap::vaddvq_f32, wrap::vaddvq_f64)
+            };
+            using native_reg = typename batch<T, A>::register_type;
+            return sum_table.apply(native_reg(arg));
+        }
+
+        /**************
+         * reduce_max *
+         **************/
+
+        WRAP_REDUCER_INT_EXCLUDING_64(vmaxvq) // macro-generated wrappers cover only the 8/16/32-bit integer lanes
+        WRAP_REDUCER_FLOAT(vmaxvq)
+
+        namespace wrap // hand-written 64-bit variants; NOTE(review): presumably because no vmaxvq_u64/_s64 intrinsic exists - confirm
+        {
+            inline uint64_t vmaxvq_u64(uint64x2_t a) noexcept
+            {
+                return std::max(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1)); // extract both lanes as scalars and take the larger
+            }
+
+            inline int64_t vmaxvq_s64(int64x2_t a) noexcept
+            {
+                return std::max(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1)); // extract both lanes as scalars and take the larger
+            }
+        }
+
+        template <class A, class T, detail::enable_neon64_type_t<T> = 0>
+        inline typename batch<T, A>::value_type reduce_max(batch<T, A> const& arg, requires_arch<neon64>) noexcept
+        {
+            // One vmaxvq wrapper per register type, in the order the dispatcher lists its types.
+            const detail::neon_reducer_dispatcher::unary max_table = {
+                std::make_tuple(wrap::vmaxvq_u8, wrap::vmaxvq_s8, wrap::vmaxvq_u16, wrap::vmaxvq_s16, wrap::vmaxvq_u32, wrap::vmaxvq_s32,
+                                wrap::vmaxvq_u64, wrap::vmaxvq_s64, wrap::vmaxvq_f32, wrap::vmaxvq_f64)
+            };
+            using native_reg = typename batch<T, A>::register_type;
+            return max_table.apply(native_reg(arg));
+        }
+
+        /**************
+         * reduce_min *
+         **************/
+
+        WRAP_REDUCER_INT_EXCLUDING_64(vminvq) // macro-generated wrappers cover only the 8/16/32-bit integer lanes
+        WRAP_REDUCER_FLOAT(vminvq)
+
+        namespace wrap // hand-written 64-bit variants; NOTE(review): presumably because no vminvq_u64/_s64 intrinsic exists - confirm
+        {
+            inline uint64_t vminvq_u64(uint64x2_t a) noexcept
+            {
+                return std::min(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1)); // extract both lanes as scalars and take the smaller
+            }
+
+            inline int64_t vminvq_s64(int64x2_t a) noexcept
+            {
+                return std::min(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1)); // extract both lanes as scalars and take the smaller
+            }
+        }
+
+        template <class A, class T, detail::enable_neon64_type_t<T> = 0>
+        inline typename batch<T, A>::value_type reduce_min(batch<T, A> const& arg, requires_arch<neon64>) noexcept
+        {
+            // One vminvq wrapper per register type, in the order the dispatcher lists its types.
+            const detail::neon_reducer_dispatcher::unary min_table = {
+                std::make_tuple(wrap::vminvq_u8, wrap::vminvq_s8, wrap::vminvq_u16, wrap::vminvq_s16, wrap::vminvq_u32, wrap::vminvq_s32,
+                                wrap::vminvq_u64, wrap::vminvq_s64, wrap::vminvq_f32, wrap::vminvq_f64)
+            };
+            using native_reg = typename batch<T, A>::register_type;
+            return min_table.apply(native_reg(arg));
+        }
+
+#undef WRAP_REDUCER_INT_EXCLUDING_64 // the three reducer helper macros are removed once the reduce_* kernels are defined
+#undef WRAP_REDUCER_INT
+#undef WRAP_REDUCER_FLOAT
+
+        /**********
+         * select *
+         **********/
+
+        template <class A>
+        inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& a, batch<double, A> const& b, requires_arch<neon64>) noexcept
+        {
+            return vbslq_f64(cond, a, b); // bitwise select: takes bits of a where cond is set, bits of b elsewhere
+        }
+
+        template <class A, bool... b>
+        inline batch<double, A> select(batch_bool_constant<batch<double, A>, b...> const&,
+                                       batch<double, A> const& true_br,
+                                       batch<double, A> const& false_br,
+                                       requires_arch<neon64>) noexcept
+        {
+            return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {}); // builds a runtime mask from the compile-time flags, then uses the runtime select
+        }
+        /**********
+         * zip_lo *
+         **********/
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vzip1q_u64(lhs, rhs); // result is { lhs[0], rhs[0] }
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vzip1q_s64(lhs, rhs); // result is { lhs[0], rhs[0] }
+        }
+
+        template <class A>
+        inline batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vzip1q_f64(lhs, rhs); // result is { lhs[0], rhs[0] }
+        }
+
+        /**********
+         * zip_hi *
+         **********/
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vzip2q_u64(lhs, rhs); // result is { lhs[1], rhs[1] }
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vzip2q_s64(lhs, rhs); // result is { lhs[1], rhs[1] }
+        }
+
+        template <class A>
+        inline batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vzip2q_f64(lhs, rhs); // result is { lhs[1], rhs[1] }
+        }
+
+        /****************
+         * extract_pair *
+         ****************/
+
+        namespace detail
+        {
+            template <class A, size_t I, size_t... Is>
+            inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n,
+                                                 ::xsimd::detail::index_sequence<I, Is...>) noexcept
+            {
+                // vextq_f64 takes its lane offset as a constant, so the runtime n is matched
+                // against each candidate index I in turn.
+                if (n != I)
+                {
+                    return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+                }
+                // Match: the result holds rhs lanes from index I onward, followed by the leading lanes of lhs.
+                return vextq_f64(rhs, lhs, I);
+            }
+        }
+
+        template <class A>
+        inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n, requires_arch<neon64>) noexcept
+        {
+            constexpr std::size_t size = batch<double, A>::size; // lane count; also the number of candidate offsets tried below
+            assert(n < size && "index in bounds"); // checked only when assertions are enabled
+            return detail::extract_pair(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>()); // detail helper maps runtime n onto indices 0..size-1
+        }
+
+        /******************
+         * bitwise_rshift *
+         ******************/
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
+        {
+            return bitwise_rshift<A>(lhs, n, neon {}); // scalar shift count: reuses the neon (not neon64) implementation
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vshlq_u64(lhs, vnegq_s64(rhs)); // vshlq with a negated per-lane count shifts right (logical, since lanes are unsigned)
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
+        {
+            return bitwise_rshift<A>(lhs, n, neon {}); // scalar shift count: reuses the neon (not neon64) implementation
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+        {
+            return vshlq_s64(lhs, vnegq_s64(rhs)); // vshlq with a negated per-lane count shifts right (arithmetic, since lanes are signed)
+        }
+
+        /****************
+         * bitwise_cast *
+         ****************/
+
+#define WRAP_CAST(SUFFIX, TYPE)                                          \
+    namespace wrap /* both directions: TYPE -> f64 and f64 -> TYPE */    \
+    {                                                                    \
+        inline float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) noexcept   \
+        {                                                                \
+            return ::vreinterpretq_f64_##SUFFIX(a);                      \
+        }                                                                \
+        inline TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) noexcept \
+        {                                                                \
+            return ::vreinterpretq_##SUFFIX##_f64(a);                    \
+        }                                                                \
+    }
+
+        WRAP_CAST(u8, uint8x16_t) // one wrapper pair per non-f64 register type, so the casts can be stored as function pointers
+        WRAP_CAST(s8, int8x16_t)
+        WRAP_CAST(u16, uint16x8_t)
+        WRAP_CAST(s16, int16x8_t)
+        WRAP_CAST(u32, uint32x4_t)
+        WRAP_CAST(s32, int32x4_t)
+        WRAP_CAST(u64, uint64x2_t)
+        WRAP_CAST(s64, int64x2_t)
+        WRAP_CAST(f32, float32x4_t)
+
+#undef WRAP_CAST
+
+        template <class A, class T>
+        inline batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
+        {
+            // Table of "reinterpret as f64" wrappers, one per possible source register type;
+            // the tuple entries below follow the same order as the type list.
+            using to_f64_caster = detail::bitwise_caster_impl<float64x2_t,
+                                                              uint8x16_t, int8x16_t, uint16x8_t, int16x8_t,
+                                                              uint32x4_t, int32x4_t, uint64x2_t, int64x2_t,
+                                                              float32x4_t>;
+            using source_register = typename batch<T, A>::register_type;
+            const to_f64_caster table = {
+                std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16,
+                                wrap::vreinterpretq_f64_u32, wrap::vreinterpretq_f64_s32, wrap::vreinterpretq_f64_u64, wrap::vreinterpretq_f64_s64,
+                                wrap::vreinterpretq_f64_f32)
+            };
+            return table.apply(source_register(arg));
+        }
+
+        namespace detail
+        {
+            // Stores a tuple of function pointers R (*)(S), one per destination
+            // register type R, and picks one of them by its exact pointer type.
+            // NOTE(review): apply() hard-codes float64x2_t instead of using S;
+            // the only instantiation visible here passes S = float64x2_t.
+            template <class S, class... R>
+            struct bitwise_caster_neon64
+            {
+                using container_type = std::tuple<R (*)(S)...>;
+                container_type m_func;
+
+                // Fetches the stored function of type V (*)(float64x2_t) through
+                // xsimd::detail::get and invokes it on rhs.
+                template <class V>
+                V apply(float64x2_t rhs) const
+                {
+                    using func_type = V (*)(float64x2_t);
+                    auto func = xsimd::detail::get<func_type>(m_func);
+                    return func(rhs);
+                }
+            };
+        }
+
+        // Reinterprets the bits of a batch of double as a batch<R, A>.
+        // The tuple holds one wrap::vreinterpretq_*_f64 function per candidate
+        // destination register type; apply<dst_register_type>() selects the one
+        // returning the register type of batch<R, A>.
+        template <class A, class R>
+        inline batch<R, A> bitwise_cast(batch<double, A> const& arg, batch<R, A> const&, requires_arch<neon64>) noexcept
+        {
+            using caster_type = detail::bitwise_caster_neon64<float64x2_t,
+                                                              uint8x16_t, int8x16_t,
+                                                              uint16x8_t, int16x8_t,
+                                                              uint32x4_t, int32x4_t,
+                                                              uint64x2_t, int64x2_t,
+                                                              float32x4_t>;
+            const caster_type caster = {
+                std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64,
+                                wrap::vreinterpretq_u32_f64, wrap::vreinterpretq_s32_f64, wrap::vreinterpretq_u64_f64, wrap::vreinterpretq_s64_f64,
+                                wrap::vreinterpretq_f32_f64)
+            };
+            using src_register_type = typename batch<double, A>::register_type;
+            using dst_register_type = typename batch<R, A>::register_type;
+            return caster.apply<dst_register_type>(src_register_type(arg));
+        }
+
+        // double -> double needs no reinterpretation: the input is returned as is.
+        template <class A>
+        inline batch<double, A> bitwise_cast(batch<double, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
+        {
+            return arg;
+        }
+
+        /*********
+         * isnan *
+         *********/
+
+        // A NaN never compares equal to itself, so negating the lane-wise
+        // self-comparison yields a mask that is set exactly on the NaN lanes.
+        template <class A>
+        inline batch_bool<double, A> isnan(batch<double, A> const& arg, requires_arch<neon64>) noexcept
+        {
+            return !(arg == arg);
+        }
+    }
+
+    // Forward declaration so the swizzle kernels below can name batch_constant
+    // in their signatures.
+    template <class batch_type, typename batch_type::value_type... Values>
+    struct batch_constant;
+
+    namespace kernel
+    {
+        /***********
+         * swizzle *
+         ***********/
+
+        namespace detail
+        {
+            using ::xsimd::batch_constant;
+            using ::xsimd::detail::integer_sequence;
+            using ::xsimd::detail::make_integer_sequence;
+
+            // Recursive helper: consumes the indices of CB1 one at a time and
+            // appends expanded indices to CB2. IS carries the offsets that are
+            // added to every consumed index.
+            template <class CB1, class CB2, class IS>
+            struct index_burst_impl;
+
+            // Base case: the input constant has no index left, so the
+            // accumulated output constant is the result.
+            template <class B1, class B2, typename B2::value_type... V,
+                      typename B2::value_type... incr>
+            struct index_burst_impl<batch_constant<B1>, batch_constant<B2, V...>,
+                                    integer_sequence<typename B2::value_type, incr...>>
+            {
+                using type = batch_constant<B2, V...>;
+            };
+
+            // Recursive case: pop V0 from the input, append (V0 + incr)... to the
+            // output and recurse on the remaining indices V1....
+            template <class B1, typename B1::value_type V0, typename B1::value_type... V1,
+                      class B2, typename B2::value_type... V2,
+                      typename B2::value_type... incr>
+            struct index_burst_impl<batch_constant<B1, V0, V1...>, batch_constant<B2, V2...>,
+                                    integer_sequence<typename B2::value_type, incr...>>
+            {
+                using value_type = typename B2::value_type;
+                using next_input = batch_constant<B1, V1...>;
+                using next_output = batch_constant<B2, V2..., (V0 + incr)...>;
+                using type = typename index_burst_impl<next_input, next_output, integer_sequence<value_type, incr...>>::type;
+            };
+
+            // Converts a batch_constant indexing lanes of B::value_type into one
+            // indexing lanes of the narrower type T.
+            template <class B, class T>
+            struct index_burst;
+
+            // Every index V is scaled by mul = sizeof(B::value_type) / sizeof(T)
+            // and expanded into mul * V plus each element of
+            // make_integer_sequence<T, mul> (presumably 0 .. mul - 1; that helper
+            // is defined elsewhere).
+            template <class B, typename B::value_type... V, class T>
+            struct index_burst<batch_constant<B, V...>, T>
+            {
+                static constexpr size_t mul = sizeof(typename B::value_type) / sizeof(T);
+                using input = batch_constant<B, (mul * V)...>;
+                using output = batch_constant<batch<T, typename B::arch_type>>;
+                using type = typename index_burst_impl<input, output, make_integer_sequence<T, mul>>::type;
+            };
+
+            // Shorthand for the expanded index constant type.
+            template <class B, class T>
+            using index_burst_t = typename index_burst<B, T>::type;
+
+            // Value-level entry point: B is deduced from the (unused) argument
+            // and a default-constructed index_burst_t<B, T> is returned.
+            template <class T, class B>
+            inline index_burst_t<B, T> burst_index(B)
+            {
+                return index_burst_t<B, T>();
+            }
+        }
+
+        // Byte swizzle (unsigned): forwards to the vqtbl1q_u8 intrinsic with self
+        // as first operand and the compile-time indices, materialized as a
+        // batch<uint8_t, A>, as second operand.
+        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
+                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
+        inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self,
+                                         batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
+                                         requires_arch<neon64>) noexcept
+        {
+            return vqtbl1q_u8(self, batch<uint8_t, A>(idx));
+        }
+
+        // Byte swizzle (signed): same as above through vqtbl1q_s8; the indices
+        // remain unsigned.
+        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
+                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
+        inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self,
+                                        batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
+                                        requires_arch<neon64>) noexcept
+        {
+            return vqtbl1q_s8(self, batch<uint8_t, A>(idx));
+        }
+
+        // Wider-lane swizzles all follow the same recipe: reinterpret the
+        // register as bytes, expand the lane indices into byte indices with
+        // detail::burst_index<uint8_t>, reuse the byte swizzle above, then
+        // reinterpret the result back to the original lane type.
+
+        // uint16_t lanes.
+        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+        inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
+                                          batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
+                                          requires_arch<neon64>) noexcept
+        {
+            using batch_type = batch<uint8_t, A>;
+            return vreinterpretq_u16_u8(swizzle<A>(batch_type(vreinterpretq_u8_u16(self)), detail::burst_index<uint8_t>(idx), A()));
+        }
+
+        // int16_t lanes (goes through the signed byte swizzle).
+        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+        inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
+                                         batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
+                                         requires_arch<neon64>) noexcept
+        {
+            using batch_type = batch<int8_t, A>;
+            return vreinterpretq_s16_s8(swizzle<A>(batch_type(vreinterpretq_s8_s16(self)), detail::burst_index<uint8_t>(idx), A()));
+        }
+
+        // uint32_t lanes.
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
+                                          batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
+                                          requires_arch<neon64>) noexcept
+        {
+            using batch_type = batch<uint8_t, A>;
+            return vreinterpretq_u32_u8(swizzle<A>(batch_type(vreinterpretq_u8_u32(self)), detail::burst_index<uint8_t>(idx), A()));
+        }
+
+        // int32_t lanes (goes through the signed byte swizzle).
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
+                                         batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
+                                         requires_arch<neon64>) noexcept
+        {
+            using batch_type = batch<int8_t, A>;
+            return vreinterpretq_s32_s8(swizzle<A>(batch_type(vreinterpretq_s8_s32(self)), detail::burst_index<uint8_t>(idx), A()));
+        }
+
+        // uint64_t lanes.
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
+                                          batch_constant<batch<uint64_t, A>, V0, V1> idx,
+                                          requires_arch<neon64>) noexcept
+        {
+            using batch_type = batch<uint8_t, A>;
+            return vreinterpretq_u64_u8(swizzle<A>(batch_type(vreinterpretq_u8_u64(self)), detail::burst_index<uint8_t>(idx), A()));
+        }
+
+        // int64_t lanes (goes through the signed byte swizzle).
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
+                                         batch_constant<batch<uint64_t, A>, V0, V1> idx,
+                                         requires_arch<neon64>) noexcept
+        {
+            using batch_type = batch<int8_t, A>;
+            return vreinterpretq_s64_s8(swizzle<A>(batch_type(vreinterpretq_s8_s64(self)), detail::burst_index<uint8_t>(idx), A()));
+        }
+
+        // float lanes, indexed with uint32_t constants.
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<float, A> swizzle(batch<float, A> const& self,
+                                       batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
+                                       requires_arch<neon64>) noexcept
+        {
+            using batch_type = batch<uint8_t, A>;
+            return vreinterpretq_f32_u8(swizzle<A>(batch_type(vreinterpretq_u8_f32(self)), detail::burst_index<uint8_t>(idx), A()));
+        }
+
+        // double lanes, indexed with uint64_t constants.
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<double, A> swizzle(batch<double, A> const& self,
+                                        batch_constant<batch<uint64_t, A>, V0, V1> idx,
+                                        requires_arch<neon64>) noexcept
+        {
+            using batch_type = batch<uint8_t, A>;
+            return vreinterpretq_f64_u8(swizzle<A>(batch_type(vreinterpretq_u8_f64(self)), detail::burst_index<uint8_t>(idx), A()));
+        }
+
+        // Swizzle of a complex<float> batch: the real and imaginary parts are
+        // swizzled independently with the same indices and recombined.
+        // The result is spelled batch<std::complex<float>, A> so that it matches
+        // the declared return type even when A is not the default architecture.
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<std::complex<float>, A> swizzle(batch<std::complex<float>, A> const& self,
+                                                     batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
+                                                     requires_arch<neon64>) noexcept
+        {
+            return batch<std::complex<float>, A>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
+        }
+
+        // Swizzle of a complex<double> batch: same scheme as the complex<float>
+        // overload, with uint64_t indices.
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<std::complex<double>, A> swizzle(batch<std::complex<double>, A> const& self,
+                                                      batch_constant<batch<uint64_t, A>, V0, V1> idx,
+                                                      requires_arch<neon64>) noexcept
+        {
+            return batch<std::complex<double>, A>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
+        }
+    }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp
new file mode 100644
index 0000000000..d5116cbd71
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp
@@ -0,0 +1,1043 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SCALAR_HPP
+#define XSIMD_SCALAR_HPP
+
+#include <cassert>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+#include "xtl/xcomplex.hpp"
+#endif
+
+namespace xsimd
+{
+    template <class T, class A>
+    class batch;
+    template <class T, class A>
+    class batch_bool;
+
+    using std::abs;
+
+    using std::acos;
+    using std::acosh;
+    using std::arg;
+    using std::asin;
+    using std::asinh;
+    using std::atan;
+    using std::atan2;
+    using std::atanh;
+    using std::cbrt;
+    using std::ceil;
+    using std::conj;
+    using std::copysign;
+    using std::cos;
+    using std::cosh;
+    using std::erf;
+    using std::erfc;
+    using std::exp;
+    using std::exp2;
+    using std::expm1;
+    using std::fabs;
+    using std::fdim;
+    using std::floor;
+    using std::fmax;
+    using std::fmin;
+    using std::fmod;
+    using std::hypot;
+    using std::ldexp;
+    using std::lgamma;
+    using std::log;
+    using std::log10;
+    using std::log1p;
+    using std::log2;
+    using std::modf;
+    using std::nearbyint;
+    using std::nextafter;
+    using std::norm;
+    using std::polar;
+    using std::proj;
+    using std::remainder;
+    using std::rint;
+    using std::round;
+    using std::sin;
+    using std::sinh;
+    using std::sqrt;
+    using std::tan;
+    using std::tanh;
+    using std::tgamma;
+    using std::trunc;
+
+#ifndef _WIN32
+    using std::isfinite;
+    using std::isinf;
+    using std::isnan;
+#else
+
+    // Windows defines catch-all templates, so instead of importing the std
+    // names xsimd declares its own overload set, split by enable_if into a
+    // floating-point overload (forwards to std) and an integral overload
+    // (converts to double first).
+
+    // isfinite for floating-point values: forwards to std::isfinite.
+    template <class T>
+    inline typename std::enable_if<std::is_floating_point<T>::value, bool>::type
+    isfinite(T var) noexcept
+    {
+        return std::isfinite(var);
+    }
+
+    // isfinite for integers: evaluated on the value converted to double.
+    template <class T>
+    inline typename std::enable_if<std::is_integral<T>::value, bool>::type
+    isfinite(T var) noexcept
+    {
+        return isfinite(double(var));
+    }
+
+    // isinf for floating-point values: forwards to std::isinf.
+    template <class T>
+    inline typename std::enable_if<std::is_floating_point<T>::value, bool>::type
+    isinf(T var) noexcept
+    {
+        return std::isinf(var);
+    }
+
+    // isinf for integers: evaluated on the value converted to double.
+    template <class T>
+    inline typename std::enable_if<std::is_integral<T>::value, bool>::type
+    isinf(T var) noexcept
+    {
+        return isinf(double(var));
+    }
+
+    // isnan for floating-point values: forwards to std::isnan.
+    template <class T>
+    inline typename std::enable_if<std::is_floating_point<T>::value, bool>::type
+    isnan(T var) noexcept
+    {
+        return std::isnan(var);
+    }
+
+    // isnan for integers: evaluated on the value converted to double.
+    template <class T>
+    inline typename std::enable_if<std::is_integral<T>::value, bool>::type
+    isnan(T var) noexcept
+    {
+        return isnan(double(var));
+    }
+#endif
+
+    // Scalar addition: returns x + y with exactly the type of that expression,
+    // so mixed operand types follow the usual arithmetic conversions.
+    template <class T, class Tp>
+    inline auto add(T const& x, Tp const& y) noexcept -> decltype(x + y)
+    {
+        return x + y;
+    }
+
+    // Bitwise AND of two integers of the same type.
+    template <class T>
+    inline typename std::enable_if<std::is_integral<T>::value, T>::type
+    bitwise_and(T x, T y) noexcept
+    {
+        return static_cast<T>(x & y);
+    }
+
+    // Bitwise AND of the object representations of two floats: both operands
+    // are copied into 32-bit integers, combined, and copied back.
+    inline float bitwise_and(float x, float y) noexcept
+    {
+        uint32_t lhs_bits;
+        uint32_t rhs_bits;
+        std::memcpy(&lhs_bits, &x, sizeof(float));
+        std::memcpy(&rhs_bits, &y, sizeof(float));
+        const uint32_t out_bits = bitwise_and(lhs_bits, rhs_bits);
+        float out;
+        std::memcpy(&out, &out_bits, sizeof(float));
+        return out;
+    }
+
+    // Bitwise AND of the object representations of two doubles, through
+    // 64-bit integers.
+    inline double bitwise_and(double x, double y) noexcept
+    {
+        uint64_t lhs_bits;
+        uint64_t rhs_bits;
+        std::memcpy(&lhs_bits, &x, sizeof(double));
+        std::memcpy(&rhs_bits, &y, sizeof(double));
+        const uint64_t out_bits = bitwise_and(lhs_bits, rhs_bits);
+        double out;
+        std::memcpy(&out, &out_bits, sizeof(double));
+        return out;
+    }
+
+    // Clears in x every bit that is set in y (x AND NOT y), for integers.
+    template <class T>
+    inline typename std::enable_if<std::is_integral<T>::value, T>::type
+    bitwise_andnot(T x, T y) noexcept
+    {
+        return static_cast<T>(x & ~y);
+    }
+
+    // x AND NOT y on the object representations of two floats.
+    inline float bitwise_andnot(float x, float y) noexcept
+    {
+        uint32_t lhs_bits;
+        uint32_t rhs_bits;
+        std::memcpy(&lhs_bits, &x, sizeof(float));
+        std::memcpy(&rhs_bits, &y, sizeof(float));
+        const uint32_t out_bits = bitwise_andnot(lhs_bits, rhs_bits);
+        float out;
+        std::memcpy(&out, &out_bits, sizeof(float));
+        return out;
+    }
+
+    // x AND NOT y on the object representations of two doubles.
+    inline double bitwise_andnot(double x, double y) noexcept
+    {
+        uint64_t lhs_bits;
+        uint64_t rhs_bits;
+        std::memcpy(&lhs_bits, &x, sizeof(double));
+        std::memcpy(&rhs_bits, &y, sizeof(double));
+        const uint64_t out_bits = bitwise_andnot(lhs_bits, rhs_bits);
+        double out;
+        std::memcpy(&out, &out_bits, sizeof(double));
+        return out;
+    }
+
+    // Flips every bit of an integer.
+    template <class T>
+    inline typename std::enable_if<std::is_integral<T>::value, T>::type
+    bitwise_not(T x) noexcept
+    {
+        return static_cast<T>(~x);
+    }
+
+    // Flips every bit of the object representation of a float.
+    inline float bitwise_not(float x) noexcept
+    {
+        uint32_t in_bits;
+        std::memcpy(&in_bits, &x, sizeof(float));
+        const uint32_t out_bits = bitwise_not(in_bits);
+        float out;
+        std::memcpy(&out, &out_bits, sizeof(float));
+        return out;
+    }
+
+    // Flips every bit of the object representation of a double.
+    inline double bitwise_not(double x) noexcept
+    {
+        uint64_t in_bits;
+        std::memcpy(&in_bits, &x, sizeof(double));
+        const uint64_t out_bits = bitwise_not(in_bits);
+        double out;
+        std::memcpy(&out, &out_bits, sizeof(double));
+        return out;
+    }
+
+    // Bitwise OR of two integers of the same type.
+    template <class T>
+    inline typename std::enable_if<std::is_integral<T>::value, T>::type
+    bitwise_or(T x, T y) noexcept
+    {
+        return static_cast<T>(x | y);
+    }
+
+    // Bitwise OR of the object representations of two floats.
+    inline float bitwise_or(float x, float y) noexcept
+    {
+        uint32_t lhs_bits;
+        uint32_t rhs_bits;
+        std::memcpy(&lhs_bits, &x, sizeof(float));
+        std::memcpy(&rhs_bits, &y, sizeof(float));
+        const uint32_t out_bits = bitwise_or(lhs_bits, rhs_bits);
+        float out;
+        std::memcpy(&out, &out_bits, sizeof(float));
+        return out;
+    }
+
+    // Bitwise OR of the object representations of two doubles.
+    inline double bitwise_or(double x, double y) noexcept
+    {
+        uint64_t lhs_bits;
+        uint64_t rhs_bits;
+        std::memcpy(&lhs_bits, &x, sizeof(double));
+        std::memcpy(&rhs_bits, &y, sizeof(double));
+        const uint64_t out_bits = bitwise_or(lhs_bits, rhs_bits);
+        double out;
+        std::memcpy(&out, &out_bits, sizeof(double));
+        return out;
+    }
+
+    // Bitwise XOR of two integers of the same type.
+    template <class T>
+    inline typename std::enable_if<std::is_integral<T>::value, T>::type
+    bitwise_xor(T x, T y) noexcept
+    {
+        return static_cast<T>(x ^ y);
+    }
+
+    // Bitwise XOR of the object representations of two floats.
+    inline float bitwise_xor(float x, float y) noexcept
+    {
+        uint32_t lhs_bits;
+        uint32_t rhs_bits;
+        std::memcpy(&lhs_bits, &x, sizeof(float));
+        std::memcpy(&rhs_bits, &y, sizeof(float));
+        const uint32_t out_bits = bitwise_xor(lhs_bits, rhs_bits);
+        float out;
+        std::memcpy(&out, &out_bits, sizeof(float));
+        return out;
+    }
+
+    // Bitwise XOR of the object representations of two doubles.
+    inline double bitwise_xor(double x, double y) noexcept
+    {
+        uint64_t lhs_bits;
+        uint64_t rhs_bits;
+        std::memcpy(&lhs_bits, &x, sizeof(double));
+        std::memcpy(&rhs_bits, &y, sizeof(double));
+        const uint64_t out_bits = bitwise_xor(lhs_bits, rhs_bits);
+        double out;
+        std::memcpy(&out, &out_bits, sizeof(double));
+        return out;
+    }
+
+    // Scalar division: returns x / y with the type of that expression.
+    template <class T, class Tp>
+    inline auto div(T const& x, Tp const& y) noexcept -> decltype(x / y)
+    {
+        return x / y;
+    }
+
+    // Scalar remainder: returns x % y; only participates in overload
+    // resolution when the expression x % y is well-formed.
+    template <class T, class Tp>
+    inline auto mod(T const& x, Tp const& y) noexcept -> decltype(x % y)
+    {
+        return x % y;
+    }
+
+    // Scalar multiplication: returns x * y with the type of that expression.
+    template <class T, class Tp>
+    inline auto mul(T const& x, Tp const& y) noexcept -> decltype(x * y)
+    {
+        return x * y;
+    }
+
+    // Unary minus: returns -x with the type of that expression.
+    template <class T>
+    inline auto neg(T const& x) noexcept -> decltype(-x)
+    {
+        return -x;
+    }
+
+    // Unary plus: returns +x with the type of that expression.
+    template <class T>
+    inline auto pos(T const& x) noexcept -> decltype(+x)
+    {
+        return +x;
+    }
+
+    // Single-precision reciprocal, computed as an exact division 1 / x.
+    inline float reciprocal(float const& x) noexcept
+    {
+        const float one = 1.f;
+        return one / x;
+    }
+
+    // Double-precision reciprocal, computed as an exact division 1 / x.
+    inline double reciprocal(double const& x) noexcept
+    {
+        const double one = 1.;
+        return one / x;
+    }
+
+#ifdef XSIMD_ENABLE_NUMPY_COMPLEX
+    // A complex number is NaN as soon as either component is NaN.
+    template <class T>
+    inline bool isnan(std::complex<T> var) noexcept
+    {
+        const bool real_is_nan = std::isnan(var.real());
+        return real_is_nan || std::isnan(var.imag());
+    }
+
+    // A complex number is infinite as soon as either component is infinite.
+    template <class T>
+    inline bool isinf(std::complex<T> var) noexcept
+    {
+        const bool real_is_inf = std::isinf(var.real());
+        return real_is_inf || std::isinf(var.imag());
+    }
+#endif
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    using xtl::abs;
+    using xtl::acos;
+    using xtl::acosh;
+    using xtl::asin;
+    using xtl::asinh;
+    using xtl::atan;
+    using xtl::atanh;
+    using xtl::cos;
+    using xtl::cosh;
+    using xtl::exp;
+    using xtl::log;
+    using xtl::log10;
+    using xtl::norm;
+    using xtl::pow;
+    using xtl::proj;
+    using xtl::sin;
+    using xtl::sinh;
+    using xtl::sqrt;
+    using xtl::tan;
+    using xtl::tanh;
+#endif
+
+    // Clamps val to the closed range [low, hi]. The bounds must be ordered
+    // (low <= hi), which is checked by an assertion in debug builds.
+    template <typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline T clip(const T& val, const T& low, const T& hi) noexcept
+    {
+        assert(low <= hi && "ordered clipping bounds");
+        if (low > val)
+        {
+            return low;
+        }
+        if (hi < val)
+        {
+            return hi;
+        }
+        return val;
+    }
+
+    // True when x is a finite value with no fractional part ("floating
+    // integer"). x - x is NaN for NaN and infinite inputs, which rejects them.
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline bool is_flint(const T& x) noexcept
+    {
+        return std::isnan(x - x) ? false : (x - std::trunc(x)) == T(0);
+    }
+
+    // True when x is an even integer value, i.e. when x / 2 is a flint.
+    // The halving is carried out in T for floating-point types and in double
+    // otherwise: for an integral T the constant T(0.5) would truncate to 0 and
+    // every integer, odd ones included, would be reported as even.
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline bool is_even(const T& x) noexcept
+    {
+        using float_type = typename std::conditional<std::is_floating_point<T>::value, T, double>::type;
+        return is_flint(static_cast<float_type>(x) * float_type(0.5));
+    }
+
+    // True when x is an odd integer value, i.e. when x - 1 is even.
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline bool is_odd(const T& x) noexcept
+    {
+        return is_even(x - 1.);
+    }
+
+    // Rounds a float with std::nearbyint (current rounding mode) and converts
+    // the result to a 32-bit integer.
+    inline int32_t nearbyint_as_int(float var) noexcept
+    {
+        const float rounded = std::nearbyint(var);
+        return static_cast<int32_t>(rounded);
+    }
+
+    // Rounds a double with std::nearbyint (current rounding mode) and converts
+    // the result to a 64-bit integer.
+    inline int64_t nearbyint_as_int(double var) noexcept
+    {
+        const double rounded = std::nearbyint(var);
+        return static_cast<int64_t>(rounded);
+    }
+
+    // Scalar equality.
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline bool eq(const T& x0, const T& x1) noexcept
+    {
+        return (x1 == x0);
+    }
+
+    // Complex equality (both components must match).
+    template <class T>
+    inline bool eq(const std::complex<T>& x0, const std::complex<T>& x1) noexcept
+    {
+        return (x1 == x0);
+    }
+
+    // Scalar x0 >= x1.
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline bool ge(const T& x0, const T& x1) noexcept
+    {
+        return (x1 <= x0);
+    }
+
+    // Scalar x0 > x1.
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline bool gt(const T& x0, const T& x1) noexcept
+    {
+        return (x1 < x0);
+    }
+
+    // Scalar x0 <= x1.
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline bool le(const T& x0, const T& x1) noexcept
+    {
+        return (x1 >= x0);
+    }
+
+    // Scalar x0 < x1.
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline bool lt(const T& x0, const T& x1) noexcept
+    {
+        return (x1 > x0);
+    }
+
+    // Scalar inequality.
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline bool neq(const T& x0, const T& x1) noexcept
+    {
+        return (x1 != x0);
+    }
+
+    // Complex inequality, expressed as the negation of equality.
+    template <class T>
+    inline bool neq(const std::complex<T>& x0, const std::complex<T>& x1) noexcept
+    {
+        const bool same = (x0 == x1);
+        return !same;
+    }
+
+// exp10 (10 raised to the power x) is not part of standard C++, so the
+// implementation is selected per platform.
+#if defined(__APPLE__)
+    // Apple: use the __exp10f / __exp10 functions.
+    inline float exp10(const float& x) noexcept
+    {
+        return __exp10f(x);
+    }
+    inline double exp10(const double& x) noexcept
+    {
+        return __exp10(x);
+    }
+#elif defined(__GLIBC__)
+    // glibc: use the global exp10f / exp10 functions.
+    inline float exp10(const float& x) noexcept
+    {
+        return ::exp10f(x);
+    }
+    inline double exp10(const double& x) noexcept
+    {
+        return ::exp10(x);
+    }
+#elif defined(_WIN32)
+    // Windows: generic template built on std::pow with base 10.
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline T exp10(const T& x) noexcept
+    {
+        // Very inefficient but other implementations give incorrect results
+        // on Windows
+        return std::pow(T(10), x);
+    }
+#else
+    // Fallback: 10^x == exp(ln(10) * x); the hexadecimal floating literals
+    // are ln(10) (about 2.302585) in single and double precision.
+    inline float exp10(const float& x) noexcept
+    {
+        return std::exp(0x1.26bb1cp+1f * x);
+    }
+    inline double exp10(const double& x) noexcept
+    {
+        return std::exp(0x1.26bb1bbb55516p+1 * x);
+    }
+#endif
+
+    // Reciprocal square root, 1 / sqrt(x), in whatever floating-point type
+    // std::sqrt returns for T (double for integral arguments).
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline auto rsqrt(const T& x) noexcept -> decltype(std::sqrt(x))
+    {
+        using float_type = decltype(std::sqrt(x));
+        const float_type root = std::sqrt(x);
+        return float_type(1) / root;
+    }
+
+    namespace detail
+    {
+        // exp(z) - 1 for a complex z = x + iy, built on expm1(x) rather than
+        // exp(x) - 1:
+        //   real part = expm1(x) - 2 * exp(x) * sin(y / 2)^2
+        //   imag part = exp(x) * sin(y)
+        template <class C>
+        inline C expm1_complex_scalar_impl(const C& val) noexcept
+        {
+            using T = typename C::value_type;
+            const T sin_im = std::sin(val.imag());
+            const T exp_re_m1 = std::expm1(val.real());
+            const T exp_re = exp_re_m1 + T(1.);
+            const T sin_half_im = std::sin(val.imag() * T(0.5));
+            const T real_part = exp_re_m1 - T(2.) * exp_re * sin_half_im * sin_half_im;
+            const T imag_part = exp_re * sin_im;
+            return std::complex<T>(real_part, imag_part);
+        }
+    }
+
+    // expm1 overload for std::complex, see detail::expm1_complex_scalar_impl.
+    template <class T>
+    inline std::complex<T> expm1(const std::complex<T>& val) noexcept
+    {
+        return detail::expm1_complex_scalar_impl(val);
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    // expm1 overload for xtl::xcomplex; shares the implementation used for
+    // std::complex.
+    template <class T, bool i3ec>
+    inline xtl::xcomplex<T, T, i3ec> expm1(const xtl::xcomplex<T, T, i3ec>& val) noexcept
+    {
+        return detail::expm1_complex_scalar_impl(val);
+    }
+#endif
+
+    namespace detail
+    {
+        // log(1 + z) for a complex z.
+        //  - when 1 + z rounds to exactly 1, z itself is returned;
+        //  - when the real part of 1 + z is not positive, plain log(1 + z);
+        //  - otherwise log(u) * z / (u - 1) with u = 1 + z, which compensates
+        //    for the rounding error made when forming u.
+        template <class C>
+        inline C log1p_complex_scalar_impl(const C& val) noexcept
+        {
+            using T = typename C::value_type;
+            const C shifted = C(1.) + val;
+            if (shifted == C(1.))
+            {
+                return val;
+            }
+            if (shifted.real() <= T(0.))
+            {
+                return log(shifted);
+            }
+            return log(shifted) * val / (shifted - C(1.));
+        }
+    }
+
+    // log1p overload for std::complex, see detail::log1p_complex_scalar_impl.
+    template <class T>
+    inline std::complex<T> log1p(const std::complex<T>& val) noexcept
+    {
+        return detail::log1p_complex_scalar_impl(val);
+    }
+
+    // Base-2 logarithm of a complex number: log(z) / log(2).
+    template <class T>
+    inline std::complex<T> log2(const std::complex<T>& val) noexcept
+    {
+        const T ln2 = std::log(T(2));
+        return log(val) / ln2;
+    }
+
+    // Saturated addition: returns lhs + rhs, clamped to the representable
+    // range of T instead of overflowing.
+    template <typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline T sadd(const T& lhs, const T& rhs) noexcept
+    {
+        if (std::numeric_limits<T>::is_signed)
+        {
+            // The bounds are tested by subtraction so that the tests themselves
+            // cannot overflow.
+            if ((lhs > 0) && (rhs > std::numeric_limits<T>::max() - lhs))
+            {
+                return std::numeric_limits<T>::max();
+            }
+            else if ((lhs < 0) && (rhs < std::numeric_limits<T>::lowest() - lhs))
+            {
+                return std::numeric_limits<T>::lowest();
+            }
+            else
+            {
+                return lhs + rhs;
+            }
+        }
+        else
+        {
+            if (rhs > std::numeric_limits<T>::max() - lhs)
+            {
+                return std::numeric_limits<T>::max();
+            }
+            else
+            {
+                return lhs + rhs;
+            }
+        }
+    }
+
+    // Saturated subtraction: returns lhs - rhs, clamped to the representable
+    // range of T instead of overflowing.
+    template <typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline T ssub(const T& lhs, const T& rhs) noexcept
+    {
+        if (std::numeric_limits<T>::is_signed)
+        {
+            if (std::is_integral<T>::value && rhs == std::numeric_limits<T>::lowest())
+            {
+                // -rhs is not representable for the lowest signed integer, so
+                // this case cannot be delegated to sadd. For a negative lhs the
+                // exact difference lies in [0, max]; otherwise it exceeds max.
+                return lhs < 0 ? static_cast<T>(lhs - rhs) : std::numeric_limits<T>::max();
+            }
+            return sadd(lhs, static_cast<T>(-rhs));
+        }
+        else
+        {
+            if (lhs < rhs)
+            {
+                return std::numeric_limits<T>::lowest();
+            }
+            else
+            {
+                return lhs - rhs;
+            }
+        }
+    }
+
+    namespace detail
+    {
+        // Maps a type to itself, or a batch<T, A> to its element type T.
+        template <class T>
+        struct value_type_or_type_helper
+        {
+            using type = T;
+        };
+        template <class T, class A>
+        struct value_type_or_type_helper<batch<T, A>>
+        {
+            using type = T;
+        };
+
+        template <class T>
+        using value_type_or_type = typename value_type_or_type_helper<T>::type;
+
+        // Integer power by repeated squaring: the exponent is halved at each
+        // step and the running base is multiplied into the result whenever
+        // the current low bit is set. A negative exponent yields the
+        // reciprocal of the accumulated power.
+        template <class T0, class T1>
+        inline typename std::enable_if<std::is_integral<T1>::value, T0>::type
+        ipow(const T0& x, const T1& n) noexcept
+        {
+            static_assert(std::is_integral<T1>::value, "second argument must be an integer");
+            const bool negative_exponent = n < 0;
+            T0 base = x;
+            T0 result(static_cast<value_type_or_type<T0>>(1));
+            for (T1 remaining = n;;)
+            {
+                if (remaining & 1)
+                {
+                    result *= base;
+                }
+                remaining /= 2;
+                if (remaining == 0)
+                {
+                    break;
+                }
+                base *= base;
+            }
+            return negative_exponent ? static_cast<T0>(1) / result : result;
+        }
+    }
+
+    // pow with an integral exponent: exponentiation by squaring (detail::ipow).
+    template <class T0, class T1>
+    inline typename std::enable_if<std::is_integral<T1>::value, T0>::type
+    pow(const T0& x, const T1& n) noexcept
+    {
+        return detail::ipow(x, n);
+    }
+
+    // pow with a scalar base and a floating-point exponent: std::pow.
+    template <class T0, class T1>
+    inline auto
+    pow(const T0& t0, const T1& t1) noexcept
+        -> typename std::enable_if<std::is_scalar<T0>::value && std::is_floating_point<T1>::value, decltype(std::pow(t0, t1))>::type
+    {
+        return std::pow(t0, t1);
+    }
+
+    // pow with a complex base and an integral exponent: detail::ipow.
+    template <class T0, class T1>
+    inline typename std::enable_if<std::is_integral<T1>::value, std::complex<T0>>::type
+    pow(const std::complex<T0>& t0, const T1& t1) noexcept
+    {
+        return detail::ipow(t0, t1);
+    }
+
+    // pow with a complex base and a non-integral exponent: std::pow.
+    template <class T0, class T1>
+    inline typename std::enable_if<!std::is_integral<T1>::value, std::complex<T0>>::type
+    pow(const std::complex<T0>& t0, const T1& t1) noexcept
+    {
+        return std::pow(t0, t1);
+    }
+
+    // pow with a scalar base and a complex exponent: std::pow.
+    template <class T0, class T1>
+    inline auto
+    pow(const T0& t0, const std::complex<T1>& t1) noexcept
+        -> typename std::enable_if<std::is_scalar<T0>::value, decltype(std::pow(t0, t1))>::type
+    {
+        return std::pow(t0, t1);
+    }
+
+    // True when x compares strictly below zero. Because this is an ordinary
+    // comparison, a negative zero (and NaN) yields false.
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline bool bitofsign(T const& x) noexcept
+    {
+        const T zero(0);
+        return x < zero;
+    }
+
+    // Alias of bitofsign; available whenever bitofsign accepts the argument.
+    template <class T>
+    inline auto signbit(T const& v) noexcept -> decltype(bitofsign(v))
+    {
+        return bitofsign(v);
+    }
+
+    // Sign of a boolean: the value itself converted to double (0 or 1).
+    inline double sign(bool const& v) noexcept
+    {
+        return v;
+    }
+
+    // Sign of a scalar: -1 for negative values, 0 for zero and 1 otherwise.
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline T sign(const T& v) noexcept
+    {
+        if (v < T(0))
+        {
+            return T(-1.);
+        }
+        if (v == T(0))
+        {
+            return T(0.);
+        }
+        return T(1.);
+    }
+
+    namespace detail
+    {
+        // Sign of a complex number as a purely real complex value: the sign of
+        // the real part when it is non-zero, the sign of the imaginary part
+        // otherwise.
+        template <class C>
+        inline C sign_complex_scalar_impl(const C& v) noexcept
+        {
+            using value_type = typename C::value_type;
+            const value_type zero_imag = value_type(0);
+            if (!v.real())
+            {
+                return C(sign(v.imag()), zero_imag);
+            }
+            return C(sign(v.real()), zero_imag);
+        }
+    }
+
+    // sign overload for std::complex, see detail::sign_complex_scalar_impl.
+    template <class T>
+    inline std::complex<T> sign(const std::complex<T>& v) noexcept
+    {
+        return detail::sign_complex_scalar_impl(v);
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class T, bool i3ec>
+    inline xtl::xcomplex<T, T, i3ec> sign(const xtl::xcomplex<T, T, i3ec>& v) noexcept
+    {
+        return detail::sign_complex_scalar_impl(v);
+    }
+#endif
+
+    inline double signnz(bool const&) noexcept
+    {
+        return 1.0; // a bool is never negative, so its never-zero sign is always +1
+    }
+
+    template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    inline T signnz(const T& v) noexcept
+    {
+        return (T(0) > v) ? T(-1.) : T(1.); // zero maps to +1: the result is never 0
+    }
+
+    template <class T, class Tp>
+    inline auto sub(T const& x, Tp const& y) noexcept -> decltype(x - y) // usable for any pair with a valid operator-
+    {
+        return x - y; // the result type follows the expression, so mixed operand types are allowed
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX // xtl::xcomplex overloads are opt-in
+    template <class T, bool i3ec>
+    inline xtl::xcomplex<T, T, i3ec> log2(const xtl::xcomplex<T, T, i3ec>& val) noexcept
+    {
+        return log(val) / log(T(2)); // change of base: log2(z) = log(z) / log(2)
+    }
+#endif
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX // xtl::xcomplex overloads are opt-in
+    template <class T, bool i3ec>
+    inline xtl::xcomplex<T, T, i3ec> log1p(const xtl::xcomplex<T, T, i3ec>& val) noexcept
+    {
+        return detail::log1p_complex_scalar_impl(val); // shared helper, defined outside this chunk
+    }
+#endif
+
+    template <class T0, class T1>
+    inline auto min(T0 const& self, T1 const& other) noexcept
+        -> typename std::enable_if<std::is_scalar<T0>::value && std::is_scalar<T1>::value,
+                                   typename std::decay<decltype(self > other ? other : self)>::type>::type
+    {
+        return (other < self) ? other : self; // ties and unordered (NaN) comparisons keep `self`
+    }
+
+    // Lexicographic minimum, as numpy does for complex values: compare real parts, break ties on the imaginary parts.
+    template <class T0, class T1>
+    inline std::complex<typename std::common_type<T0, T1>::type>
+    min(std::complex<T0> const& self, std::complex<T1> const& other) noexcept
+    {
+        return (self.real() < other.real() || (self.real() == other.real() && self.imag() < other.imag())) ? self : other;
+    }
+
+    template <class T0, class T1>
+    inline auto max(T0 const& self, T1 const& other) noexcept
+        -> typename std::enable_if<std::is_scalar<T0>::value && std::is_scalar<T1>::value,
+                                   typename std::decay<decltype(self > other ? other : self)>::type>::type
+    {
+        return (other > self) ? other : self; // ties and unordered (NaN) comparisons keep `self`
+    }
+
+    // Lexicographic maximum, as numpy does for complex values: compare real parts, break ties on the imaginary parts.
+    template <class T0, class T1>
+    inline std::complex<typename std::common_type<T0, T1>::type>
+    max(std::complex<T0> const& self, std::complex<T1> const& other) noexcept
+    {
+        return (self.real() > other.real() || (self.real() == other.real() && self.imag() > other.imag())) ? self : other;
+    }
+
+    template <class T>
+    inline typename std::enable_if<std::is_integral<T>::value, T>::type fma(const T& a, const T& b, const T& c) noexcept
+    {
+        return a * b + c; // integral overload: ordinary multiply followed by add
+    }
+
+    template <class T>
+    inline typename std::enable_if<std::is_floating_point<T>::value, T>::type fma(const T& a, const T& b, const T& c) noexcept
+    {
+        return std::fma(a, b, c); // floating-point overload: a * b + c through std::fma
+    }
+
+    template <class T>
+    inline typename std::enable_if<std::is_scalar<T>::value, T>::type fms(const T& a, const T& b, const T& c) noexcept
+    {
+        return a * b - c; // NOTE(review): one overload for every scalar; unlike fma above it does not call std::fma
+    }
+
+    namespace detail
+    {
+        template <class C>
+        inline C fma_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
+        {
+            return { fms(a.real(), b.real(), fms(a.imag(), b.imag(), c.real())), // re: a.re*b.re - (a.im*b.im - c.re)
+                     fma(a.real(), b.imag(), fma(a.imag(), b.real(), c.imag())) }; // im: a.re*b.im + (a.im*b.re + c.im)
+        }
+    }
+
+    template <class T>
+    inline std::complex<T> fma(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
+    {
+        return detail::fma_complex_scalar_impl(a, b, c); // complex a * b + c
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class T, bool i3ec>
+    inline xtl::xcomplex<T, T, i3ec> fma(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
+    {
+        return detail::fma_complex_scalar_impl(a, b, c); // same helper, xtl complex type
+    }
+#endif
+
+    namespace detail
+    {
+        template <class C>
+        inline C fms_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
+        {
+            return { fms(a.real(), b.real(), fma(a.imag(), b.imag(), c.real())), // re: a.re*b.re - (a.im*b.im + c.re)
+                     fma(a.real(), b.imag(), fms(a.imag(), b.real(), c.imag())) }; // im: a.re*b.im + (a.im*b.re - c.im)
+        }
+    }
+
+    template <class T>
+    inline std::complex<T> fms(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
+    {
+        return detail::fms_complex_scalar_impl(a, b, c); // complex a * b - c
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class T, bool i3ec>
+    inline xtl::xcomplex<T, T, i3ec> fms(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
+    {
+        return detail::fms_complex_scalar_impl(a, b, c); // same helper, xtl complex type
+    }
+#endif
+
+    template <class T>
+    inline typename std::enable_if<std::is_integral<T>::value, T>::type fnma(const T& a, const T& b, const T& c) noexcept
+    {
+        return -(a * b) + c; // integral overload of -(a * b) + c
+    }
+
+    template <class T>
+    inline typename std::enable_if<std::is_floating_point<T>::value, T>::type fnma(const T& a, const T& b, const T& c) noexcept
+    {
+        return std::fma(-a, b, c); // negating one factor gives -(a * b) + c in a single std::fma
+    }
+
+    namespace detail
+    {
+        template <class C>
+        inline C fnma_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
+        {
+            return { fms(a.imag(), b.imag(), fms(a.real(), b.real(), c.real())), // re: a.im*b.im - (a.re*b.re - c.re)
+                     -fma(a.real(), b.imag(), fms(a.imag(), b.real(), c.imag())) }; // im: -(a.re*b.im + (a.im*b.re - c.im))
+        }
+    }
+
+    template <class T>
+    inline std::complex<T> fnma(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
+    {
+        return detail::fnma_complex_scalar_impl(a, b, c); // complex -(a * b) + c
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class T, bool i3ec>
+    inline xtl::xcomplex<T, T, i3ec> fnma(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
+    {
+        return detail::fnma_complex_scalar_impl(a, b, c); // same helper, xtl complex type
+    }
+#endif
+
+    template <class T>
+    inline typename std::enable_if<std::is_integral<T>::value, T>::type fnms(const T& a, const T& b, const T& c) noexcept
+    {
+        return -(a * b) - c; // integral overload of -(a * b) - c
+    }
+
+    template <class T>
+    inline typename std::enable_if<std::is_floating_point<T>::value, T>::type fnms(const T& a, const T& b, const T& c) noexcept
+    {
+        return -std::fma(a, b, c); // -(a * b + c), which equals -(a * b) - c
+    }
+
+    namespace detail
+    {
+        template <class C>
+        inline C fnms_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
+        {
+            return { fms(a.imag(), b.imag(), fma(a.real(), b.real(), c.real())), // re: a.im*b.im - (a.re*b.re + c.re)
+                     -fma(a.real(), b.imag(), fma(a.imag(), b.real(), c.imag())) }; // im: -(a.re*b.im + (a.im*b.re + c.im))
+        }
+    }
+
+    template <class T>
+    inline std::complex<T> fnms(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
+    {
+        return detail::fnms_complex_scalar_impl(a, b, c); // complex -(a * b) - c
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class T, bool i3ec>
+    inline xtl::xcomplex<T, T, i3ec> fnms(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
+    {
+        return detail::fnms_complex_scalar_impl(a, b, c); // same helper, xtl complex type
+    }
+#endif
+
+    namespace detail
+    {
+#define XSIMD_HASSINCOS_TRAIT(func)                                                                                              \
+    template <class S>                                                                                                           \
+    struct has##func                                                                                                             \
+    {                                                                                                                            \
+        template <class T>                                                                                                       \
+        static auto get(T* ptr) -> decltype(func(std::declval<T>(), std::declval<T*>(), std::declval<T*>()), std::true_type {}); \
+        static std::false_type get(...);                                                                                         \
+        static constexpr bool value = decltype(get((S*)nullptr))::value;                                                         \
+    }
+
+#define XSIMD_HASSINCOS(func, T) has##func<T>::value
+
+        XSIMD_HASSINCOS_TRAIT(sincos); // hassincos<S>::value: true when sincos(S, S*, S*) is a valid call
+        XSIMD_HASSINCOS_TRAIT(sincosf); // same detection for sincosf
+        XSIMD_HASSINCOS_TRAIT(__sincos); // same detection for __sincos
+        XSIMD_HASSINCOS_TRAIT(__sincosf); // same detection for __sincosf
+
+        struct generic_sincosf // float sine + cosine; exactly one operator() survives SFINAE
+        {
+            template <class T>
+            typename std::enable_if<XSIMD_HASSINCOS(sincosf, T), void>::type
+            operator()(float val, T& s, T& c)
+            {
+                sincosf(val, &s, &c); // first choice: sincosf is callable
+            }
+
+            template <class T>
+            typename std::enable_if<!XSIMD_HASSINCOS(sincosf, T) && XSIMD_HASSINCOS(__sincosf, T), void>::type
+            operator()(float val, T& s, T& c)
+            {
+                __sincosf(val, &s, &c); // second choice: only __sincosf is callable
+            }
+
+            template <class T>
+            typename std::enable_if<!XSIMD_HASSINCOS(sincosf, T) && !XSIMD_HASSINCOS(__sincosf, T), void>::type
+            operator()(float val, T& s, T& c)
+            {
+                s = std::sin(val); // fallback: neither combined function exists, make two std calls
+                c = std::cos(val);
+            }
+        };
+
+        struct generic_sincos // double counterpart of generic_sincosf
+        {
+            template <class T>
+            typename std::enable_if<XSIMD_HASSINCOS(sincos, T), void>::type
+            operator()(double val, T& s, T& c)
+            {
+                sincos(val, &s, &c); // first choice: sincos is callable
+            }
+
+            template <class T>
+            typename std::enable_if<!XSIMD_HASSINCOS(sincos, T) && XSIMD_HASSINCOS(__sincos, T), void>::type
+            operator()(double val, T& s, T& c)
+            {
+                __sincos(val, &s, &c); // second choice: only __sincos is callable
+            }
+
+            template <class T>
+            typename std::enable_if<!XSIMD_HASSINCOS(sincos, T) && !XSIMD_HASSINCOS(__sincos, T), void>::type
+            operator()(double val, T& s, T& c)
+            {
+                s = std::sin(val); // fallback: neither combined function exists, make two std calls
+                c = std::cos(val);
+            }
+        };
+
+#undef XSIMD_HASSINCOS_TRAIT // both helper macros are only needed inside this namespace block
+#undef XSIMD_HASSINCOS
+    }
+
+    inline std::pair<float, float> sincos(float val) noexcept
+    {
+        float sine, cosine; // both are written by the functor call on the next line
+        detail::generic_sincosf {}(val, sine, cosine);
+        return std::pair<float, float>(sine, cosine);
+    }
+
+    inline std::pair<double, double> sincos(double val) noexcept
+    {
+        double sine, cosine; // both are written by the functor call on the next line
+        detail::generic_sincos {}(val, sine, cosine);
+        return std::pair<double, double>(sine, cosine);
+    }
+
+    template <class T>
+    inline std::pair<std::complex<T>, std::complex<T>>
+    sincos(const std::complex<T>& val) noexcept
+    {
+        return { std::sin(val), std::cos(val) }; // first = sine, second = cosine
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class T>
+    inline std::pair<xtl::xcomplex<T>, xtl::xcomplex<T>> sincos(const xtl::xcomplex<T>& val) noexcept
+    {
+        return std::make_pair(sin(val), cos(val)); // unqualified sin / cos, resolved for the xtl type
+    }
+#endif
+
+    template <class T, class _ = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
+    inline T frexp(T const& val, int& exp) noexcept // reference-taking wrapper, floating-point types only
+    {
+        return std::frexp(val, &exp); // std::frexp wants a pointer; the exponent is written into `exp`
+    }
+
+    template <class T>
+    inline T select(bool cond, T const& true_br, T const& false_br) noexcept
+    {
+        return !cond ? false_br : true_br; // scalar select: a copy of whichever branch `cond` picks
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp
new file mode 100644
index 0000000000..e4949523ca
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp
@@ -0,0 +1,1695 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSE2_HPP
+#define XSIMD_SSE2_HPP
+
+#include <complex>
+#include <limits>
+#include <type_traits>
+
+#include "../types/xsimd_sse2_register.hpp"
+
+namespace xsimd
+{
+    template <class batch_type, bool... Values>
+    struct batch_bool_constant;
+
+    template <class T_out, class T_in, class A>
+    inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
+
+    template <class batch_type, typename batch_type::value_type... Values>
+    struct batch_constant;
+
+    namespace kernel
+    {
+        using namespace types;
+
+        namespace detail
+        {
+            constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
+            {
+                return w | (x << 2) | (y << 4) | (z << 6); // four 2-bit selectors, `w` in the lowest bits
+            }
+            constexpr uint32_t shuffle(uint32_t x, uint32_t y)
+            {
+                return x | (y << 1); // two 1-bit selectors, `x` in bit 0
+            }
+        }
+
+        // fwd
+        template <class A, class T, size_t I>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
+
+        // abs: clear the sign bit of every lane with an and-not against a sign-bit-only mask
+        template <class A>
+        inline batch<double, A> abs(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            __m128d sign_mask = _mm_set1_pd(-0.f); // -0.f widens to -0.0: only the sign bit (bit 63) of each double is set
+            return _mm_andnot_pd(sign_mask, self);
+        }
+        template <class A>
+        inline batch<float, A> abs(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f has only the sign bit set (1 << 31)
+            return _mm_andnot_ps(sign_mask, self);
+        }
+
+        // add: lane-wise addition; integral batches pick the intrinsic from the element width
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm_add_epi8(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm_add_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_add_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm_add_epi64(self, other);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination"); // only reached for element sizes other than 1, 2, 4 or 8 bytes
+                return {};
+            }
+        }
+
+        template <class A>
+        inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_add_ps(self, other); // packed single precision
+        }
+
+        template <class A>
+        inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_add_pd(self, other); // packed double precision
+        }
+
+        // all: true when every lane of the mask is set (tested through the movemask bits)
+        template <class A>
+        inline bool all(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_movemask_ps(self) == 0x0F; // one bit per float lane: 4 lanes -> 0b1111
+        }
+        template <class A>
+        inline bool all(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_movemask_pd(self) == 0x03; // one bit per double lane: 2 lanes -> 0b11
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline bool all(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_movemask_epi8(self) == 0xFFFF; // one bit per byte, so 16 bits whatever sizeof(T) is
+        }
+
+        // any: true when at least one movemask bit of the mask is set
+        template <class A>
+        inline bool any(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_movemask_ps(self) != 0; // sign bit of each float lane
+        }
+        template <class A>
+        inline bool any(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_movemask_pd(self) != 0; // sign bit of each double lane
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline bool any(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_movemask_epi8(self) != 0; // top bit of each byte, whatever sizeof(T) is
+        }
+
+        // batch_bool_cast: re-type a mask by passing its register through bitwise_cast
+        template <class A, class T_out, class T_in>
+        inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<sse2>) noexcept
+        {
+            return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data }; // wrap in batch<>, cast, then unwrap the register again
+        }
+
+        // bitwise_and: bit-level AND over the full 128-bit register, for value batches and for masks
+        template <class A>
+        inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_and_ps(self, other);
+        }
+        template <class A>
+        inline batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_and_ps(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_and_si128(self, other); // element width is irrelevant for a pure bit operation
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_and_si128(self, other);
+        }
+
+        template <class A>
+        batch<double, A> inline bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_and_pd(self, other);
+        }
+
+        template <class A>
+        inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_and_pd(self, other);
+        }
+
+        // bitwise_andnot: self & ~other; the intrinsics negate their FIRST operand, hence the swapped arguments
+        template <class A>
+        inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_andnot_ps(other, self); // (~other) & self
+        }
+
+        template <class A>
+        inline batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_andnot_ps(other, self); // (~other) & self
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_andnot_si128(other, self); // (~other) & self
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_andnot_si128(other, self); // (~other) & self
+        }
+
+        template <class A>
+        inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_andnot_pd(other, self); // (~other) & self
+        }
+
+        template <class A>
+        inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_andnot_pd(other, self); // (~other) & self
+        }
+
+        // bitwise_lshift: shift every integral lane left by the same run-time count `other`
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other)); // shift as 32-bit lanes, then mask off bits carried in from the lower neighbouring byte
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm_slli_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_slli_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm_slli_epi64(self, other);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination"); // only reached for element sizes other than 1, 2, 4 or 8 bytes
+                return {};
+            }
+        }
+
+        // bitwise_not: flip every bit by XOR-ing with an all-ones register (_mm_set1_epi32(-1))
+        template <class A>
+        inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); // the cast only re-types the all-ones register
+        }
+        template <class A>
+        inline batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1)));
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_si128(self, _mm_set1_epi32(-1)); // -1 in every 32-bit lane = all 128 bits set
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_si128(self, _mm_set1_epi32(-1));
+        }
+        template <class A>
+        inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); // the cast only re-types the all-ones register
+        }
+        template <class A>
+        inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1)));
+        }
+
+        // bitwise_or: bit-level OR over the full 128-bit register, for value batches and for masks
+        template <class A>
+        inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_or_ps(self, other);
+        }
+        template <class A>
+        inline batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_or_ps(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_or_si128(self, other); // element width is irrelevant for a pure bit operation
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_or_si128(self, other);
+        }
+
+        template <class A>
+        inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_or_pd(self, other);
+        }
+
+        template <class A>
+        inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_or_pd(self, other);
+        }
+
+        // bitwise_rshift: arithmetic (sign-filling) shift for signed element types, logical (zero-filling) shift for unsigned ones
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF); // top `other` bits of the LOW byte of each 16-bit lane
+                    __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self); // 0xFF in every byte that holds a negative value
+                    __m128i res = _mm_srai_epi16(self, other); // high bytes are already right; low bytes received bits from their upper neighbour
+                    return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res)); // overwrite those bits with each low byte's own sign fill
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm_srai_epi16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm_srai_epi32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    // logical 64-bit shift OR-ed with a shifted-in sign fill; from https://github.com/samyvilar/vect/blob/master/vect_128.h
+                    return _mm_or_si128(
+                        _mm_srli_epi64(self, other),
+                        _mm_slli_epi64(
+                            _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32), // broadcast each lane's upper word, then turn it into pure sign bits
+                            64 - other));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination"); // only reached for element sizes other than 1, 2, 4 or 8 bytes
+                    return {};
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other)); // shift as 32-bit lanes, then mask off bits carried in from the upper neighbouring byte
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm_srli_epi16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm_srli_epi32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm_srli_epi64(self, other);
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination"); // only reached for element sizes other than 1, 2, 4 or 8 bytes
+                    return {};
+                }
+            }
+        }
+
+        // bitwise_xor: bit-level XOR over the full 128-bit register, for value batches and for masks
+        template <class A>
+        inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_ps(self, other);
+        }
+        template <class A>
+        inline batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_ps(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_si128(self, other); // element width is irrelevant for a pure bit operation
+        }
+        template <class A>
+        inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_pd(self, other);
+        }
+        template <class A>
+        inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_pd(self, other);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_si128(self, other); // NOTE(review): returns batch<T, A>, while the batch_bool and/or overloads return batch_bool<T, A> - confirm intended
+        }
+
+        // bitwise_cast: re-type the 128-bit register; the second (unnamed) parameter only selects the destination type
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
+        {
+            return _mm_castsi128_ps(self); // integral -> float
+        }
+        template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
+        inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<sse2>) noexcept
+        {
+            return batch<Tp, A>(self.data); // integral -> integral: rewrap the same register, no intrinsic involved
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
+        {
+            return _mm_castps_si128(self); // float -> integral
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
+        {
+            return _mm_castsi128_pd(self); // integral -> double
+        }
+        template <class A>
+        inline batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
+        {
+            return _mm_castps_pd(self); // float -> double
+        }
+        template <class A>
+        inline batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
+        {
+            return _mm_castpd_ps(self); // double -> float
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
+        {
+            return _mm_castpd_si128(self); // double -> integral
+        }
+
+        // broadcast
+        template <class A>
+        batch<float, A> inline broadcast(float val, requires_arch<sse2>) noexcept
+        {
+            return _mm_set1_ps(val);
+        }
+        // Fill every integer lane with `val`; the intrinsic is chosen from the byte width of T.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> broadcast(T val, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                const __m128i splat8 = _mm_set1_epi8(val);
+                return splat8;
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                const __m128i splat16 = _mm_set1_epi16(val);
+                return splat16;
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                const __m128i splat32 = _mm_set1_epi32(val);
+                return splat32;
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                const __m128i splat64 = _mm_set1_epi64x(val);
+                return splat64;
+            }
+            else
+            {
+                // No integer type of any other width is handled here.
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+        // Fill every double lane with `val`.
+        template <class A>
+        inline batch<double, A> broadcast(double val, requires_arch<sse2>) noexcept
+        {
+            const __m128d splat = _mm_set1_pd(val);
+            return splat;
+        }
+
+        // store_complex
+        namespace detail
+        {
+            // Override these methods in SSE-based archs, no need to override store_aligned / store_unaligned
+
+            // complex_low (float): interleave the low halves of the real and imaginary registers.
+            template <class A>
+            inline batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
+            {
+                auto const& re = self.real();
+                auto const& im = self.imag();
+                return _mm_unpacklo_ps(re, im);
+            }
+            // complex_high (float): interleave the high halves of the real and imaginary registers.
+            template <class A>
+            inline batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
+            {
+                auto const& re = self.real();
+                auto const& im = self.imag();
+                return _mm_unpackhi_ps(re, im);
+            }
+            // complex_low (double): interleave the low halves of the real and imaginary registers.
+            template <class A>
+            inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
+            {
+                auto const& re = self.real();
+                auto const& im = self.imag();
+                return _mm_unpacklo_pd(re, im);
+            }
+            // complex_high (double): interleave the high halves of the real and imaginary registers.
+            template <class A>
+            inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
+            {
+                auto const& re = self.real();
+                auto const& im = self.imag();
+                return _mm_unpackhi_pd(re, im);
+            }
+        }
+
+        // div
+        // Lane-wise float division: self / other.
+        template <class A>
+        inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128 quotient = _mm_div_ps(self, other);
+            return quotient;
+        }
+        // Lane-wise double division: self / other.
+        template <class A>
+        inline batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128d quotient = _mm_div_pd(self, other);
+            return quotient;
+        }
+
+        // fast_cast
+        // SSE2 implementations of the value-converting casts between integer and floating-point batches.
+        namespace detail
+        {
+            // int32 -> float: direct conversion with the packed convert intrinsic.
+            template <class A>
+            inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
+            {
+                return _mm_cvtepi32_ps(self);
+            }
+
+            // uint32 -> float: only a signed 32-bit conversion is used here, so each lane is split
+            // into two 16-bit halves, both halves are converted, and the result is hi * 65536 + lo.
+            template <class A>
+            inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<sse2>) noexcept
+            {
+                // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
+                __m128i msk_lo = _mm_set1_epi32(0xFFFF);
+                __m128 cnst65536f = _mm_set1_ps(65536.0f);
+
+                __m128i v_lo = _mm_and_si128(v, msk_lo); /* 16 least significant bits of v */
+                __m128i v_hi = _mm_srli_epi32(v, 16); /* 16 most significant bits of v */
+                __m128 v_lo_flt = _mm_cvtepi32_ps(v_lo); /* No rounding */
+                __m128 v_hi_flt = _mm_cvtepi32_ps(v_hi); /* No rounding */
+                v_hi_flt = _mm_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
+                return _mm_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
+            }
+
+            // uint64 -> double: the high and low 32-bit halves of each lane are OR-ed into the bit
+            // patterns of the doubles 2^84 and 2^52 respectively; subtracting (2^84 + 2^52) from the
+            // high part and adding the low part then yields the converted value.
+            template <class A>
+            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                // adapted to sse2
+                __m128i xH = _mm_srli_epi64(x, 32);
+                xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); //  2^84
+                // mask keeps the low 32 bits of each 64-bit lane
+                __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
+                // the integer literal is converted to the double 2^52 before its bits are taken
+                __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); //  2^52
+                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); //  2^84 + 2^52
+                return _mm_add_pd(f, _mm_castsi128_pd(xL));
+            }
+
+            // int64 -> double: same scheme as the unsigned variant, but the upper part is obtained
+            // with an arithmetic shift (keeping the sign) and the magic constants are 3*2^67 and 2^52.
+            template <class A>
+            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                // adapted to sse2
+                __m128i xH = _mm_srai_epi32(x, 16);
+                // keep only the upper 32 bits of each 64-bit lane of the shifted value
+                xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
+                xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); //  3*2^67
+                // mask keeps the low 48 bits of each 64-bit lane
+                __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
+                __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); //  2^52
+                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); //  3*2^67 + 2^52
+                return _mm_add_pd(f, _mm_castsi128_pd(xL));
+            }
+
+            // float -> int32: direct conversion with the truncating packed convert intrinsic.
+            template <class A>
+            inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<sse2>) noexcept
+            {
+                return _mm_cvttps_epi32(self);
+            }
+
+            // float -> uint32: lanes below 2^31 are converted directly; lanes >= 2^31 are converted
+            // after subtracting 2^31 and then get the top bit set again with an XOR. `mask` selects
+            // between the two results per lane.
+            template <class A>
+            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse2>) noexcept
+            {
+                __m128 mask = _mm_cmpge_ps(self, _mm_set1_ps(1u << 31));
+                __m128 lhs = _mm_castsi128_ps(_mm_cvttps_epi32(self));
+                __m128 rhs = _mm_castsi128_ps(_mm_xor_si128(
+                    _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
+                    _mm_set1_epi32(1u << 31)));
+                return _mm_castps_si128(_mm_or_ps(_mm_and_ps(mask, rhs), _mm_andnot_ps(mask, lhs)));
+            }
+
+        }
+
+        // eq
+        // Lane-wise float equality.
+        template <class A>
+        inline batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128 same = _mm_cmpeq_ps(self, other);
+            return same;
+        }
+        // Equality of two float masks, done as a 32-bit integer compare on the raw mask bits.
+        template <class A>
+        inline batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128i lhs_bits = _mm_castps_si128(self);
+            const __m128i rhs_bits = _mm_castps_si128(other);
+            const __m128i same = _mm_cmpeq_epi32(lhs_bits, rhs_bits);
+            return _mm_castsi128_ps(same);
+        }
+        // Lane-wise integer equality; the compare intrinsic is chosen from the byte width of T.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm_cmpeq_epi8(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm_cmpeq_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_cmpeq_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                // Built from a 32-bit compare: a 64-bit lane is equal only when both of its
+                // 32-bit halves are equal.
+                const __m128i half_eq = _mm_cmpeq_epi32(self, other);
+                // Swap the two halves inside each 64-bit lane (0xB1).
+                const __m128i half_eq_swapped = _mm_shuffle_epi32(half_eq, _MM_SHUFFLE(2, 3, 0, 1));
+                const __m128i both_eq = _mm_and_si128(half_eq, half_eq_swapped);
+                const __m128i sign_spread = _mm_srai_epi32(both_eq, 31);
+                // Duplicate the upper half of each 64-bit lane into its lower half (0xF5).
+                return _mm_shuffle_epi32(sign_spread, _MM_SHUFFLE(3, 3, 1, 1));
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+        // Equality of two integer masks, expressed as the complement of their inequality.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const batch_bool<T, A> differs = (self != other);
+            return ~differs;
+        }
+        // Lane-wise double equality.
+        template <class A>
+        inline batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128d same = _mm_cmpeq_pd(self, other);
+            return same;
+        }
+        // Equality of two double masks, done as a 32-bit integer compare on the raw mask bits.
+        template <class A>
+        inline batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128i lhs_bits = _mm_castpd_si128(self);
+            const __m128i rhs_bits = _mm_castpd_si128(other);
+            const __m128i same = _mm_cmpeq_epi32(lhs_bits, rhs_bits);
+            return _mm_castsi128_pd(same);
+        }
+
+        // from_mask
+        // Expand a 4-bit mask into a float batch_bool: lane i is all-ones when bit i of `mask`
+        // is set, all-zeros otherwise. `mask` must fit in 4 bits (checked by assert only).
+        template <class A>
+        inline batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
+        {
+            // One pre-built 128-bit row per possible 4-bit mask value.
+            alignas(A::alignment()) static const uint32_t lut[][4] = {
+                { 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+                { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 },
+                { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 },
+                { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 },
+                { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 },
+                { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 },
+                { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
+                { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
+                { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+                { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF },
+                { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
+                { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
+                { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+                { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+                { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+                { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+            };
+            // Complement a 64-bit constant: `~0xFul` is only 32 bits wide where `unsigned long`
+            // is 32-bit, which would let set bits in the upper half of `mask` pass the check.
+            assert(!(mask & ~uint64_t(0xF)) && "inbound mask");
+            return _mm_castsi128_ps(_mm_load_si128(reinterpret_cast<const __m128i*>(lut[mask])));
+        }
+        // Expand a 2-bit mask into a double batch_bool: lane i is all-ones when bit i of `mask`
+        // is set, all-zeros otherwise. `mask` must fit in 2 bits (checked by assert only).
+        template <class A>
+        inline batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
+        {
+            // One 128-bit row (two 64-bit lanes) per possible mask value. Rows hold exactly two
+            // entries, so each row is as wide as the 128-bit load below.
+            alignas(A::alignment()) static const uint64_t lut[][2] = {
+                { 0x0000000000000000ul, 0x0000000000000000ul },
+                { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+                { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+                { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+            };
+            // Complement a 64-bit constant: `~0x3ul` is only 32 bits wide where `unsigned long`
+            // is 32-bit, which would let set bits in the upper half of `mask` pass the check.
+            assert(!(mask & ~uint64_t(0x3)) && "inbound mask");
+            return _mm_castsi128_pd(_mm_load_si128(reinterpret_cast<const __m128i*>(lut[mask])));
+        }
+        // Expand a bit mask into an integer batch_bool: lane i is all-ones when bit i of `mask`
+        // is set. The number of meaningful mask bits depends on the byte width of T
+        // (16 for 1-byte lanes, 8 for 2-byte lanes; 4- and 8-byte lanes reuse the float and
+        // double overloads).
+        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
+        {
+            // 4 mask bits -> four 16-bit lanes packed in a 64-bit word.
+            alignas(A::alignment()) static const uint64_t lut64[] = {
+                0x0000000000000000,
+                0x000000000000FFFF,
+                0x00000000FFFF0000,
+                0x00000000FFFFFFFF,
+                0x0000FFFF00000000,
+                0x0000FFFF0000FFFF,
+                0x0000FFFFFFFF0000,
+                0x0000FFFFFFFFFFFF,
+                0xFFFF000000000000,
+                0xFFFF00000000FFFF,
+                0xFFFF0000FFFF0000,
+                0xFFFF0000FFFFFFFF,
+                0xFFFFFFFF00000000,
+                0xFFFFFFFF0000FFFF,
+                0xFFFFFFFFFFFF0000,
+                0xFFFFFFFFFFFFFFFF,
+            };
+            // 4 mask bits -> four 8-bit lanes packed in a 32-bit word.
+            alignas(A::alignment()) static const uint32_t lut32[] = {
+                0x00000000,
+                0x000000FF,
+                0x0000FF00,
+                0x0000FFFF,
+                0x00FF0000,
+                0x00FF00FF,
+                0x00FFFF00,
+                0x00FFFFFF,
+                0xFF000000,
+                0xFF0000FF,
+                0xFF00FF00,
+                0xFF00FFFF,
+                0xFFFF0000,
+                0xFFFF00FF,
+                0xFFFFFF00,
+                0xFFFFFFFF,
+            };
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                assert(!(mask & ~0xFFFF) && "inbound mask");
+                // Four nibbles of the mask, lowest first, each expanded to 4 byte lanes.
+                return _mm_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                assert(!(mask & ~0xFF) && "inbound mask");
+                // _mm_set_epi64x takes the high 64 bits first.
+                return _mm_set_epi64x(lut64[mask >> 4], lut64[mask & 0xF]);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_castps_si128(from_mask(batch_bool<float, A> {}, mask, sse2 {}));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm_castpd_si128(from_mask(batch_bool<double, A> {}, mask, sse2 {}));
+            }
+            else
+            {
+                // Same fallback as the other size-dispatched kernels in this file; it also
+                // guarantees every path of this non-void function returns a value.
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
+        // ge
+        // Lane-wise float `self >= other`.
+        template <class A>
+        inline batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128 outcome = _mm_cmpge_ps(self, other);
+            return outcome;
+        }
+        // Lane-wise double `self >= other`.
+        template <class A>
+        inline batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128d outcome = _mm_cmpge_pd(self, other);
+            return outcome;
+        }
+
+        // gt
+        // Lane-wise float `self > other`.
+        template <class A>
+        inline batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128 outcome = _mm_cmpgt_ps(self, other);
+            return outcome;
+        }
+        // Lane-wise integer `self > other`. Signed 1-, 2- and 4-byte lanes use the matching
+        // compare intrinsic; every other case is forwarded to the generic kernel.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            // Unsigned types: the signed compare intrinsics below do not apply.
+            if (!std::is_signed<T>::value)
+            {
+                return gt(self, other, generic {});
+            }
+
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm_cmpgt_epi8(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm_cmpgt_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_cmpgt_epi32(self, other);
+            }
+            else
+            {
+                return gt(self, other, generic {});
+            }
+        }
+
+        // Lane-wise double `self > other`.
+        template <class A>
+        inline batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128d outcome = _mm_cmpgt_pd(self, other);
+            return outcome;
+        }
+
+        // haddp
+        // Horizontal add of four float rows: lane i of the result is the sum of the lanes of row[i].
+        // `row` must point to at least four batches.
+        template <class A>
+        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse2>) noexcept
+        {
+            // Partial sums of rows 0/1 and rows 2/3, still interleaved.
+            const __m128 sum01 = _mm_add_ps(_mm_unpacklo_ps(row[0], row[1]), _mm_unpackhi_ps(row[0], row[1]));
+            const __m128 sum23 = _mm_add_ps(_mm_unpacklo_ps(row[2], row[3]), _mm_unpackhi_ps(row[2], row[3]));
+            // Regroup the halves so one final add finishes all four sums.
+            const __m128 upper_halves = _mm_movehl_ps(sum23, sum01);
+            const __m128 lower_halves = _mm_movelh_ps(sum01, sum23);
+            return _mm_add_ps(lower_halves, upper_halves);
+        }
+        // Horizontal add of two double rows: lane i of the result is the sum of the lanes of row[i].
+        // `row` must point to at least two batches.
+        template <class A>
+        inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse2>) noexcept
+        {
+            const __m128d low_lanes = _mm_unpacklo_pd(row[0], row[1]);
+            const __m128d high_lanes = _mm_unpackhi_pd(row[0], row[1]);
+            return _mm_add_pd(low_lanes, high_lanes);
+        }
+
+        // insert
+        // Replace lane I of an integer batch with `val`. Only 2-byte lanes have a dedicated
+        // intrinsic here; every other width goes through the generic kernel.
+        template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) != 2)
+            {
+                return insert(self, val, pos, generic {});
+            }
+            else
+            {
+                return _mm_insert_epi16(self, val, I);
+            }
+        }
+
+        // isnan
+        // Float NaN test: compare each lane with itself using the "unordered" predicate.
+        template <class A>
+        inline batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            const __m128 unordered = _mm_cmpunord_ps(self, self);
+            return unordered;
+        }
+        // Double NaN test: compare each lane with itself using the "unordered" predicate.
+        template <class A>
+        inline batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            const __m128d unordered = _mm_cmpunord_pd(self, self);
+            return unordered;
+        }
+
+        // load_aligned
+        // Load a float batch from `mem` with the aligned-load intrinsic.
+        template <class A>
+        inline batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
+        {
+            const __m128 loaded = _mm_load_ps(mem);
+            return loaded;
+        }
+        // Load an integer batch from `mem` with the aligned-load intrinsic.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
+        {
+            __m128i const* const src = reinterpret_cast<__m128i const*>(mem);
+            return _mm_load_si128(src);
+        }
+        // Load a double batch from `mem` with the aligned-load intrinsic.
+        template <class A>
+        inline batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
+        {
+            const __m128d loaded = _mm_load_pd(mem);
+            return loaded;
+        }
+
+        // load_unaligned
+        // Load a float batch from `mem` with the unaligned-load intrinsic.
+        template <class A>
+        inline batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
+        {
+            const __m128 loaded = _mm_loadu_ps(mem);
+            return loaded;
+        }
+        // Load an integer batch from `mem` with the unaligned-load intrinsic.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
+        {
+            __m128i const* const src = reinterpret_cast<__m128i const*>(mem);
+            return _mm_loadu_si128(src);
+        }
+        // Load a double batch from `mem` with the unaligned-load intrinsic.
+        template <class A>
+        inline batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
+        {
+            const __m128d loaded = _mm_loadu_pd(mem);
+            return loaded;
+        }
+
+        // load_complex
+        namespace detail
+        {
+            // Redefine these methods in the SSE-based archs if required
+
+            // Build a complex<float> batch from two float registers: the even-indexed lanes of
+            // (hi, lo) become the real parts and the odd-indexed lanes the imaginary parts.
+            template <class A>
+            inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<sse2>) noexcept
+            {
+                const __m128 real_part = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0));
+                const __m128 imag_part = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1));
+                return { real_part, imag_part };
+            }
+            // Build a complex<double> batch from two double registers: lane 0 of each input gives
+            // the real parts, lane 1 the imaginary parts.
+            template <class A>
+            inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<sse2>) noexcept
+            {
+                const __m128d real_part = _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0));
+                const __m128d imag_part = _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1));
+                return { real_part, imag_part };
+            }
+        }
+
+        // le
+        // Lane-wise float `self <= other`.
+        template <class A>
+        inline batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128 outcome = _mm_cmple_ps(self, other);
+            return outcome;
+        }
+        // Lane-wise double `self <= other`.
+        template <class A>
+        inline batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128d outcome = _mm_cmple_pd(self, other);
+            return outcome;
+        }
+
+        // lt
+        // Lane-wise float `self < other`.
+        template <class A>
+        inline batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128 outcome = _mm_cmplt_ps(self, other);
+            return outcome;
+        }
+        // Lane-wise integer `self < other`.
+        // Signed lanes of 1/2/4 bytes use the signed compare intrinsics directly. Unsigned lanes
+        // first XOR both operands with the lowest value of the same-width signed type and then
+        // run the signed comparison. 8-byte lanes are compared with a bitwise formula whose
+        // result sits in the top bit of each lane, which is then spread over the whole lane.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm_cmplt_epi8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm_cmplt_epi16(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm_cmplt_epi32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    const __m128i difference = _mm_sub_epi64(self, other);
+                    const __m128i bits_differ = _mm_xor_si128(self, other);
+                    // Top bit set when self has it and other does not.
+                    const __m128i self_and_not_other = _mm_andnot_si128(other, self);
+                    // Where the top bits agree, the top bit of the difference decides.
+                    const __m128i same_bits_difference = _mm_andnot_si128(bits_differ, difference);
+                    const __m128i verdict = _mm_or_si128(self_and_not_other, same_bits_difference);
+                    const __m128i sign_spread = _mm_srai_epi32(verdict, 31);
+                    // Copy the upper half of each 64-bit lane into its lower half (0xF5).
+                    return _mm_shuffle_epi32(sign_spread, _MM_SHUFFLE(3, 3, 1, 1));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                    return {};
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    const __m128i bias = _mm_set1_epi8(std::numeric_limits<int8_t>::lowest());
+                    return _mm_cmplt_epi8(_mm_xor_si128(self, bias), _mm_xor_si128(other, bias));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    const __m128i bias = _mm_set1_epi16(std::numeric_limits<int16_t>::lowest());
+                    return _mm_cmplt_epi16(_mm_xor_si128(self, bias), _mm_xor_si128(other, bias));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    const __m128i bias = _mm_set1_epi32(std::numeric_limits<int32_t>::lowest());
+                    return _mm_cmplt_epi32(_mm_xor_si128(self, bias), _mm_xor_si128(other, bias));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    const __m128i bias = _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest());
+                    const __m128i biased_self = _mm_xor_si128(self, bias);
+                    const __m128i biased_other = _mm_xor_si128(other, bias);
+                    // Same formula as the signed 64-bit case, applied to the biased operands.
+                    const __m128i difference = _mm_sub_epi64(biased_self, biased_other);
+                    const __m128i bits_differ = _mm_xor_si128(biased_self, biased_other);
+                    const __m128i self_and_not_other = _mm_andnot_si128(biased_other, biased_self);
+                    const __m128i same_bits_difference = _mm_andnot_si128(bits_differ, difference);
+                    const __m128i verdict = _mm_or_si128(self_and_not_other, same_bits_difference);
+                    const __m128i sign_spread = _mm_srai_epi32(verdict, 31);
+                    return _mm_shuffle_epi32(sign_spread, _MM_SHUFFLE(3, 3, 1, 1));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                    return {};
+                }
+            }
+        }
+
+        // Lane-wise double `self < other`.
+        template <class A>
+        inline batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128d outcome = _mm_cmplt_pd(self, other);
+            return outcome;
+        }
+
+        /* bit compression turning 0b10 into 0b1,
+         * 0b100010 into 0b101 etc
+         */
+        namespace detail
+        {
+            // Gather the odd-numbered bits (1, 3, 5, 7) of the low byte of `mask` into the low
+            // four bits of the result; every other input bit is ignored.
+            inline int mask_lut(int mask)
+            {
+                const int odd_bits = mask & 0xAA;
+                const int bit0 = (odd_bits >> 1) & 0x1; // input bit 1
+                const int bit1 = (odd_bits >> 2) & 0x2; // input bit 3
+                const int bit2 = (odd_bits >> 3) & 0x4; // input bit 5
+                const int bit3 = (odd_bits >> 4) & 0x8; // input bit 7
+                return bit0 | bit1 | bit2 | bit3;
+            }
+        }
+
+        // mask
+        // Pack an integer batch_bool into a bit mask, one bit per lane (lane 0 in bit 0).
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm_movemask_epi8(self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                // The byte-wise mask carries two bits per 16-bit lane; mask_lut keeps one bit
+                // of each pair, handling 8 bits of the byte mask per call.
+                uint64_t byte_mask = _mm_movemask_epi8(self);
+                const int low_lanes = detail::mask_lut(byte_mask);
+                const int high_lanes = detail::mask_lut(byte_mask >> 8);
+                return low_lanes | (high_lanes << 4);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                const __m128 as_ps = _mm_castsi128_ps(self);
+                return _mm_movemask_ps(as_ps);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                const __m128d as_pd = _mm_castsi128_pd(self);
+                return _mm_movemask_pd(as_pd);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+        // Pack a float batch_bool into a bit mask, one bit per lane.
+        template <class A>
+        inline uint64_t mask(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            const int lane_bits = _mm_movemask_ps(self);
+            return lane_bits;
+        }
+
+        // Pack a double batch_bool into a bit mask, one bit per lane.
+        template <class A>
+        inline uint64_t mask(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            const int lane_bits = _mm_movemask_pd(self);
+            return lane_bits;
+        }
+
+        // max
+        // Lane-wise float maximum.
+        template <class A>
+        inline batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128 larger = _mm_max_ps(self, other);
+            return larger;
+        }
+        // Lane-wise integer maximum, built from a comparison followed by a select.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const auto self_is_greater = self > other;
+            return select(self_is_greater, self, other);
+        }
+        // Lane-wise double maximum.
+        template <class A>
+        inline batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128d larger = _mm_max_pd(self, other);
+            return larger;
+        }
+
+        // min
+        // Lane-wise float minimum.
+        template <class A>
+        inline batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128 smaller = _mm_min_ps(self, other);
+            return smaller;
+        }
+        // Lane-wise integer minimum, built from a comparison followed by a select.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const auto self_not_greater = self <= other;
+            return select(self_not_greater, self, other);
+        }
+        // Lane-wise double minimum.
+        template <class A>
+        inline batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128d smaller = _mm_min_pd(self, other);
+            return smaller;
+        }
+
+        // mul
+        // Lane-wise float product.
+        template <class A>
+        inline batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128 product = _mm_mul_ps(self, other);
+            return product;
+        }
+        // Lane-wise double product.
+        template <class A>
+        inline batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128d product = _mm_mul_pd(self, other);
+            return product;
+        }
+
+        // nearbyint_as_int
+        // Convert each float lane to a 32-bit integer with the (non-truncating) packed
+        // convert intrinsic.
+        template <class A>
+        inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+                                                  requires_arch<sse2>) noexcept
+        {
+            const __m128i as_int = _mm_cvtps_epi32(self);
+            return as_int;
+        }
+
+        // neg
+        // Integer negation, computed as zero minus the batch.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> neg(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            const batch<T, A> negated = 0 - self;
+            return negated;
+        }
+        // Float negation: flip the top bit of every 32-bit lane.
+        template <class A>
+        inline batch<float, A> neg(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            const __m128 top_bit = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+            return _mm_xor_ps(self, top_bit);
+        }
+
+        // Double negation: flip the top bit of every 64-bit lane.
+        template <class A>
+        inline batch<double, A> neg(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            // The top bit of each 64-bit lane lives in the second 32-bit word of the pair.
+            const __m128d top_bit = _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000));
+            return _mm_xor_pd(self, top_bit);
+        }
+
+        // neq
+        // Lane-wise float inequality.
+        template <class A>
+        inline batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128 differs = _mm_cmpneq_ps(self, other);
+            return differs;
+        }
+        // Lane-wise integer inequality, expressed as the complement of equality.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const batch_bool<T, A> same = (self == other);
+            return ~same;
+        }
+        // Inequality of two float masks.
+        // Mask lanes are bit patterns (all-ones / all-zeros), not numbers, so they are combined
+        // with XOR. A floating-point "not equal" compare must not be used here: an all-ones lane
+        // is a NaN bit pattern, and that compare reports any NaN operand as "not equal", so two
+        // `true` lanes would wrongly compare as different.
+        template <class A>
+        inline batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_ps(self, other);
+        }
+        // Inequality of two integer masks: XOR of the raw mask bits (performed in the float domain).
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128 lhs_bits = _mm_castsi128_ps(self.data);
+            const __m128 rhs_bits = _mm_castsi128_ps(other.data);
+            const __m128 differing = _mm_xor_ps(lhs_bits, rhs_bits);
+            return _mm_castps_si128(differing);
+        }
+
+        // Lane-wise double inequality.
+        template <class A>
+        inline batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            const __m128d differs = _mm_cmpneq_pd(self, other);
+            return differs;
+        }
+        // Inequality of two double masks.
+        // Mask lanes are bit patterns (all-ones / all-zeros), not numbers, so they are combined
+        // with XOR. A floating-point "not equal" compare must not be used here: an all-ones lane
+        // is a NaN bit pattern, and that compare reports any NaN operand as "not equal", so two
+        // `true` lanes would wrongly compare as different.
+        template <class A>
+        inline batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_xor_pd(self, other);
+        }
+
+        // reciprocal: per-lane 1/x through _mm_rcp_ps, which is an approximation rather than an exact division
+        template <class A>
+        inline batch<float, A> reciprocal(batch<float, A> const& self,
+                                          kernel::requires_arch<sse2>) noexcept
+        {
+            return _mm_rcp_ps(self);
+        }
+
+        // reduce_add: horizontal sum of every lane of the batch, returned as a scalar
+        template <class A>
+        inline float reduce_add(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            const __m128 pairs = _mm_add_ps(self, _mm_movehl_ps(self, self)); // lanes 0,1 now hold s0+s2 and s1+s3
+            const __m128 total = _mm_add_ss(pairs, _mm_shuffle_ps(pairs, pairs, 1)); // lane 0 += lane 1
+            return _mm_cvtss_f32(total);
+        }
+
+        // reduce_max: largest lane for element types of at most 2 bytes; folds the register in halves (assumes detail::shuffle builds a lane-select immediate)
+        template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+        inline T reduce_max(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            constexpr auto fold64 = detail::shuffle(2, 3, 0, 0);
+            batch<T, A> upper64 = _mm_shuffle_epi32(self, fold64);
+            batch<T, A> best64 = max(self, upper64);
+
+            constexpr auto fold32 = detail::shuffle(1, 0, 0, 0);
+            batch<T, A> upper32 = _mm_shuffle_epi32(best64, fold32);
+            batch<T, A> best32 = max(best64, upper32);
+
+            constexpr auto fold16 = detail::shuffle(1, 0, 0, 0);
+            batch<T, A> upper16 = _mm_shufflelo_epi16(best32, fold16);
+            batch<T, A> best16 = max(best32, upper16);
+            if (sizeof(T) == 2)
+                return best16.get(0);
+            batch<T, A> upper8 = bitwise_cast<T>(bitwise_cast<uint16_t>(best16) >> 8);
+            batch<T, A> best8 = max(best16, upper8);
+            return best8.get(0);
+        }
+
+        // reduce_min: smallest lane for element types of at most 2 bytes; folds the register in halves (assumes detail::shuffle builds a lane-select immediate)
+        template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+        inline T reduce_min(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            constexpr auto fold64 = detail::shuffle(2, 3, 0, 0);
+            batch<T, A> upper64 = _mm_shuffle_epi32(self, fold64);
+            batch<T, A> best64 = min(self, upper64);
+
+            constexpr auto fold32 = detail::shuffle(1, 0, 0, 0);
+            batch<T, A> upper32 = _mm_shuffle_epi32(best64, fold32);
+            batch<T, A> best32 = min(best64, upper32);
+
+            constexpr auto fold16 = detail::shuffle(1, 0, 0, 0);
+            batch<T, A> upper16 = _mm_shufflelo_epi16(best32, fold16);
+            batch<T, A> best16 = min(best32, upper16);
+            if (sizeof(T) == 2)
+                return best16.get(0);
+            batch<T, A> upper8 = bitwise_cast<T>(bitwise_cast<uint16_t>(best16) >> 8);
+            batch<T, A> best8 = min(best16, upper8);
+            return best8.get(0);
+        }
+        // TODO: move this in xsimd_generic
+        namespace detail
+        {
+            template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+            inline T hadd_default(batch<T, A> const& self, requires_arch<sse2>) noexcept
+            {
+                alignas(A::alignment()) T lanes[batch<T, A>::size]; // scalar fallback: spill the register to an aligned array
+                self.store_aligned(lanes);
+                T sum = 0;
+                for (size_t i = 0; i < batch<T, A>::size; ++i)
+                {
+                    sum += lanes[i]; // the running total is kept in T
+                }
+                return sum;
+            }
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                const __m128i upper = _mm_shuffle_epi32(self, 0x0E); // lanes 2,3 copied down to lanes 0,1
+                const __m128i pairs = _mm_add_epi32(self, upper);
+                const __m128i odd = _mm_shuffle_epi32(pairs, 0x01); // lane 1 copied down to lane 0
+                const __m128i total = _mm_add_epi32(pairs, odd);
+                return _mm_cvtsi128_si32(total);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                const __m128i upper = _mm_shuffle_epi32(self, 0x0E); // high 64 bits copied to the low half
+                const __m128i total = _mm_add_epi64(self, upper);
+#if defined(__x86_64__)
+                return _mm_cvtsi128_si64(total);
+#else
+                __m128i low; // other targets: store the low 64 bits and copy them out
+                _mm_storel_epi64(&low, total);
+                int64_t value;
+                std::memcpy(&value, &low, sizeof(value));
+                return value;
+#endif
+            }
+            else
+            {
+                return detail::hadd_default(self, A {}); // remaining element sizes: scalar loop
+            }
+        }
+        template <class A> // double overload of the horizontal sum
+        inline double reduce_add(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self))); // low lane + high lane, extracted as a scalar
+        }
+
+        // rsqrt: per-lane reciprocal square root
+        template <class A>
+        inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
+        {
+            return _mm_rsqrt_ps(val); // approximate result, not a full-precision 1/sqrt
+        }
+        template <class A> // double overload: computed through the single-precision approximation
+        inline batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val))); // narrow to float, approximate, widen back
+        }
+
+        // select: per lane, take true_br where cond is set and false_br elsewhere
+        template <class A>
+        inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse2>) noexcept
+        {
+            return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br)); // (cond & true_br) | (~cond & false_br)
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
+        {
+            return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br)); // integral lanes: (cond & true_br) | (~cond & false_br)
+        }
+        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
+        {
+            return select(batch_bool<T, A> { Values... }, true_br, false_br, sse2 {}); // compile-time mask: build a batch_bool from Values and reuse the runtime overload
+        }
+        template <class A> // double overload of the lane-wise select
+        inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse2>) noexcept
+        {
+            return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br)); // (cond & true_br) | (~cond & false_br)
+        }
+
+        // sqrt: per-lane square root
+        template <class A>
+        inline batch<float, A> sqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
+        {
+            return _mm_sqrt_ps(val); // four packed floats
+        }
+        template <class A> // double overload of the per-lane square root
+        inline batch<double, A> sqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
+        {
+            return _mm_sqrt_pd(val); // two packed doubles
+        }
+
+        // slide_left: shift the whole 128-bit register left by N bytes (N is a compile-time byte count)
+        template <size_t N, class A, class T>
+        inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<sse2>) noexcept
+        {
+            return _mm_slli_si128(x, N); // byte-granular shift; vacated bytes are zero-filled
+        }
+
+        // slide_right: shift the whole 128-bit register right by N bytes (N is a compile-time byte count)
+        template <size_t N, class A, class T>
+        inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<sse2>) noexcept
+        {
+            return _mm_srli_si128(x, N); // byte-granular shift; vacated bytes are zero-filled
+        }
+
+        // sadd
+
+        // TODO: move this in xsimd_generic
+        namespace detail
+        {
+            template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+            inline batch<T, A> sadd_default(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+            {
+                if (!std::is_signed<T>::value)
+                { // unsigned: add no more than the headroom left above self
+                    const auto headroom = std::numeric_limits<T>::max() - self;
+                    return self + min(headroom, other);
+                }
+                else
+                {
+                    // signed: clamp self into the range other can be added to; the bound depends on other's sign
+                    auto other_sign = (other >> (8 * sizeof(T) - 1));
+                    auto clamped_hi = min(std::numeric_limits<T>::max() - other, self);
+                    auto clamped_lo = max(std::numeric_limits<T>::min() - other, self);
+                    return other + select(batch_bool<T, A>(other_sign.data), clamped_lo, clamped_hi);
+                }
+            }
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        { // saturating add: native intrinsics for 8/16-bit lanes, detail::sadd_default for the other sizes
+            if (!std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm_adds_epu8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm_adds_epu16(self, other);
+                }
+                else
+                {
+                    return detail::sadd_default(self, other, A {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm_adds_epi8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm_adds_epi16(self, other);
+                }
+                else
+                {
+                    return detail::sadd_default(self, other, A {});
+                }
+            }
+        }
+
+        // set: build a batch from one scalar per lane, the first value going to lane 0
+        template <class A, class... Values>
+        inline batch<float, A> set(batch<float, A> const&, requires_arch<sse2>, Values... values) noexcept
+        {
+            static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
+            return _mm_setr_ps(values...); // "setr" takes its arguments in memory (lane) order
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1) noexcept
+        {
+            return _mm_set_epi64x(v1, v0); // two 64-bit lanes; the intrinsic takes the high lane first, so v0 lands in lane 0
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3) noexcept
+        {
+            return _mm_setr_epi32(v0, v1, v2, v3); // four 32-bit lanes, v0 in lane 0
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+        {
+            return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7); // eight 16-bit lanes, v0 in lane 0
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+        {
+            return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); // sixteen 8-bit lanes, v0 in lane 0
+        }
+
+        template <class A, class... Values> // double overload: one scalar per lane
+        inline batch<double, A> set(batch<double, A> const&, requires_arch<sse2>, Values... values) noexcept
+        {
+            static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
+            return _mm_setr_pd(values...); // first value goes to the low lane
+        }
+
+        template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sse2>, Values... values) noexcept
+        {
+            return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data; // mask lanes: -1 (all bits set) for true, 0 for false
+        }
+
+        template <class A, class... Values> // float mask: built as 32-bit integer lanes, then reinterpreted
+        inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<sse2>, Values... values) noexcept
+        {
+            static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
+            return _mm_castsi128_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data); // -1 = all bits set for true lanes
+        }
+
+        template <class A, class... Values> // double mask: built as 64-bit integer lanes, then reinterpreted
+        inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<sse2>, Values... values) noexcept
+        {
+            static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
+            return _mm_castsi128_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data); // -1 = all bits set for true lanes
+        }
+
+        // ssub: saturating subtraction (the result is clamped to the range of T instead of wrapping)
+        // TODO: move this in xsimd_generic
+        namespace detail
+        {
+            template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+            inline batch<T, A> ssub_default(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+            {
+                if (std::is_signed<T>::value)
+                {
+                    // Not sadd(self, -other): -other wraps when other == min(T), which saturates towards the wrong bound.
+                    const auto diff = self - other; // wrapping difference
+                    const auto wrapped = ((self ^ other) & (self ^ diff)) >> (8 * sizeof(T) - 1); // all-ones where operand signs differ and the sign of self was lost
+                    const auto limit = (self >> (8 * sizeof(T) - 1)) ^ batch<T, A>(std::numeric_limits<T>::max()); // max(T) for self >= 0, min(T) otherwise
+                    return select(batch_bool<T, A>(wrapped.data), limit, diff);
+                }
+                return self - min(self, other); // unsigned: never subtract more than self holds, so the result stops at 0
+            }
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        { // saturating subtract: native intrinsics for 8/16-bit lanes, detail::ssub_default for the other sizes
+            if (!std::is_signed<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm_subs_epu8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm_subs_epu16(self, other);
+                }
+                else
+                {
+                    return detail::ssub_default(self, other, A {});
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    return _mm_subs_epi8(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    return _mm_subs_epi16(self, other);
+                }
+                else
+                {
+                    return detail::ssub_default(self, other, A {});
+                }
+            }
+        }
+
+        // store_aligned: write the whole register to mem using the aligned-store intrinsics
+        template <class A>
+        inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_store_ps(mem, self);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_store_si128(reinterpret_cast<__m128i*>(mem), self); // integral lanes are stored as one raw 128-bit value
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_store_si128(reinterpret_cast<__m128i*>(mem), self); // mask overload: the raw mask register is written as-is
+        }
+        template <class A> // double overload of the aligned store
+        inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_store_pd(mem, self);
+        }
+
+        // store_unaligned: write the whole register to mem using the unaligned-store intrinsics
+        template <class A>
+        inline void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_storeu_ps(mem, self);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(mem), self); // integral lanes are stored as one raw 128-bit value
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(mem), self); // mask overload: the raw mask register is written as-is
+        }
+        template <class A> // double overload of the unaligned store
+        inline void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_storeu_pd(mem, self);
+        }
+
+        // sub: lane-wise self - other
+        template <class A>
+        inline batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_sub_ps(self, other); // four packed floats
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        { // integral self - other; the intrinsic is chosen from the element width
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm_sub_epi64(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_sub_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm_sub_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm_sub_epi8(self, other);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+        template <class A> // double overload of the lane-wise subtraction
+        inline batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_sub_pd(self, other); // two packed doubles
+        }
+
+        // swizzle
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
+        {
+            constexpr uint32_t lane_selector = detail::shuffle(V0, V1, V2, V3); // compile-time immediate built from the four indices
+            return _mm_shuffle_ps(self, self, lane_selector); // both sources are self, so every output lane comes from self
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<sse2>) noexcept
+        {
+            constexpr uint32_t lane_selector = detail::shuffle(V0, V1); // compile-time immediate built from the two indices
+            return _mm_shuffle_pd(self, self, lane_selector); // both sources are self, so every output lane comes from self
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<sse2>) noexcept
+        {
+            constexpr uint32_t lane_selector = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1); // each 64-bit index becomes a pair of 32-bit indices
+            return _mm_shuffle_epi32(self, lane_selector);
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<sse2>) noexcept
+        {
+            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, sse2 {})); // signed lanes: reinterpret as uint64_t, reuse that overload, reinterpret back
+        }
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
+        {
+            constexpr uint32_t lane_selector = detail::shuffle(V0, V1, V2, V3); // compile-time immediate built from the four indices
+            return _mm_shuffle_epi32(self, lane_selector);
+        }
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
+        {
+            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {})); // signed lanes: reinterpret as uint32_t, reuse that overload, reinterpret back
+        }
+
+        // zip_hi: interleave the lanes of the upper halves of self and other
+        template <class A>
+        inline batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_unpackhi_ps(self, other); // self lanes land in the even positions, other lanes in the odd ones
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        { // integral upper-half interleave; the intrinsic is chosen from the element width
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm_unpackhi_epi64(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_unpackhi_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm_unpackhi_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm_unpackhi_epi8(self, other);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+        template <class A> // double overload of the upper-half interleave
+        inline batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_unpackhi_pd(self, other); // result = { high lane of self, high lane of other }
+        }
+
+        // zip_lo: interleave the lanes of the lower halves of self and other
+        template <class A>
+        inline batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_unpacklo_ps(self, other); // self lanes land in the even positions, other lanes in the odd ones
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+        { // integral lower-half interleave; the intrinsic is chosen from the element width
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm_unpacklo_epi64(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_unpacklo_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm_unpacklo_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm_unpacklo_epi8(self, other);
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+        template <class A> // double overload of the lower-half interleave
+        inline batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_unpacklo_pd(self, other); // result = { low lane of self, low lane of other }
+        }
+    }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp
new file mode 100644
index 0000000000..ccc049795c
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp
@@ -0,0 +1,64 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSE3_HPP
+#define XSIMD_SSE3_HPP
+
+#include "../types/xsimd_sse3_register.hpp"
+#include <type_traits>
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        // haddp: lane i of the result is the horizontal sum of row[i]; reads row[0] through row[3]
+        template <class A>
+        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse3>) noexcept
+        {
+            return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]), // pairwise sums of rows 0 and 1
+                               _mm_hadd_ps(row[2], row[3])); // pairwise sums of rows 2 and 3; the outer hadd finishes each row
+        }
+        template <class A> // double overload: reads row[0] and row[1]
+        inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse3>) noexcept
+        {
+            return _mm_hadd_pd(row[0], row[1]); // result = { sum of row[0] lanes, sum of row[1] lanes }
+        }
+
+        // load_unaligned: SSE3 override for integral batches, using _mm_lddqu_si128 for the unaligned 128-bit load
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse3>) noexcept
+        {
+            return _mm_lddqu_si128(reinterpret_cast<__m128i const*>(mem));
+        }
+
+        // reduce_add: horizontal sum of every lane, done with two SSE3 horizontal adds
+        template <class A>
+        inline float reduce_add(batch<float, A> const& self, requires_arch<sse3>) noexcept
+        {
+            const __m128 pairs = _mm_hadd_ps(self, self); // s0+s1 and s2+s3
+            const __m128 total = _mm_hadd_ps(pairs, pairs); // lane 0 holds the sum of all four lanes
+            return _mm_cvtss_f32(total);
+        }
+        template <class A> // double overload: a single horizontal add sums both lanes
+        inline double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept
+        {
+            const __m128d total = _mm_hadd_pd(self, self);
+            return _mm_cvtsd_f64(total);
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp
new file mode 100644
index 0000000000..c0e2878ef9
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp
@@ -0,0 +1,350 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSE4_1_HPP
+#define XSIMD_SSE4_1_HPP
+
+#include <type_traits>
+
+#include "../types/xsimd_sse4_1_register.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+        // any (integral T): true as soon as one bit of self is set
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline bool any(batch<T, A> const& self, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_testz_si128(self, self) == 0; // testz reports 1 only when (self & self) has no bit set
+        }
+        // ceil (float): rounds every lane up to the nearest integral value via _mm_ceil_ps
+        template <class A>
+        inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_ceil_ps(self);
+        }
+        template <class A> // ceil (double): rounds both lanes up to the nearest integral value
+        inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_ceil_pd(self);
+        }
+
+        // fast_cast: SSE4.1 conversions int64_t -> double, uint64_t -> double and float -> uint32_t
+        namespace detail
+        {
+            template <class A>
+            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                const __m128i hi_shifted = _mm_srai_epi32(x, 16); // arithmetic shift: the upper dwords keep their sign
+                const __m128i hi_only = _mm_blend_epi16(hi_shifted, _mm_setzero_si128(), 0x33); // zero the low 32 bits of each lane
+                const __m128i hi_biased = _mm_add_epi64(hi_only, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); //  3*2^67
+                const __m128i lo_biased = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); //  2^52
+                const __m128d hi_value = _mm_sub_pd(_mm_castsi128_pd(hi_biased), _mm_set1_pd(442726361368656609280.)); //  3*2^67 + 2^52
+                return _mm_add_pd(hi_value, _mm_castsi128_pd(lo_biased));
+            }
+
+            template <class A>
+            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                const __m128i hi_dword = _mm_srli_epi64(x, 32); // upper 32 bits of each lane, zero-extended
+                const __m128i hi_biased = _mm_or_si128(hi_dword, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); //  2^84
+                const __m128i lo_biased = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); //  2^52
+                const __m128d hi_value = _mm_sub_pd(_mm_castsi128_pd(hi_biased), _mm_set1_pd(19342813118337666422669312.)); //  2^84 + 2^52
+                return _mm_add_pd(hi_value, _mm_castsi128_pd(lo_biased));
+            }
+
+            template <class A>
+            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse4_1>) noexcept
+            {
+                const __m128 two_pow_31 = _mm_set1_ps(1u << 31);
+                // lanes >= 2^31 do not fit the signed conversion: convert (self - 2^31), then flip bit 31 back on
+                const __m128i above = _mm_xor_si128(_mm_cvttps_epi32(_mm_sub_ps(self, two_pow_31)), _mm_set1_epi32(1u << 31));
+                const __m128i below = _mm_cvttps_epi32(self);
+                const __m128 picked = _mm_blendv_ps(_mm_castsi128_ps(below), _mm_castsi128_ps(above), _mm_cmpge_ps(self, two_pow_31));
+                return _mm_castps_si128(picked);
+            }
+        }
+
+        // eq (integral T): lane-wise equality; only 64-bit lanes need the SSE4.1 _mm_cmpeq_epi64
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) != 8)
+            {
+                return eq(self, other, ssse3 {}); // narrower lanes: resolved again one architecture level down
+            }
+            else
+            {
+                return _mm_cmpeq_epi64(self, other);
+            }
+        }
+
+        // floor (float): rounds every lane down to the nearest integral value via _mm_floor_ps
+        template <class A>
+        inline batch<float, A> floor(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_floor_ps(self);
+        }
+        template <class A> // floor (double): rounds both lanes down to the nearest integral value
+        inline batch<double, A> floor(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_floor_pd(self);
+        }
+
+        // insert (integral T): returns self with lane I replaced by val; pos is only forwarded to the fallback
+        template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+#if (!defined(_MSC_VER) && __x86_64__) || (_MSC_VER > 1900 && defined(_M_X64))
+                return _mm_insert_epi64(self, val, I);
+#else // no 64-bit insert on this target: write the value as two 32-bit halves, in memory order
+                uint32_t low_half, high_half;
+                memcpy(&low_half, &val, sizeof(low_half));
+                memcpy(&high_half, reinterpret_cast<unsigned char*>(&val) + sizeof(low_half), sizeof(high_half));
+                return _mm_insert_epi32(_mm_insert_epi32(self, low_half, 2 * I), high_half, 2 * I + 1);
+#endif
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_insert_epi32(self, val, I);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm_insert_epi8(self, val, I);
+            }
+            else
+            {
+                return insert(self, val, pos, ssse3 {}); // remaining lane widths are resolved one level down
+            }
+        }
+
+        // max (integral T): lane-wise maximum; 8-, 16- and 32-bit lanes map to one intrinsic each
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
+        {
+            // dispatch on the lane width first, then on signedness (epi* = signed, epu* = unsigned)
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm_max_epi8(self, other);
+                }
+                else
+                {
+                    return _mm_max_epu8(self, other);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm_max_epi16(self, other);
+                }
+                else
+                {
+                    return _mm_max_epu16(self, other);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm_max_epi32(self, other);
+                }
+                else
+                {
+                    return _mm_max_epu32(self, other);
+                }
+            }
+            else
+            {
+                return max(self, other, ssse3 {}); // other widths: resolved again one architecture level down
+            }
+        }
+
+        // min (integral T): lane-wise minimum; 8-, 16- and 32-bit lanes map to one intrinsic each
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
+        {
+            // dispatch on the lane width first, then on signedness (epi* = signed, epu* = unsigned)
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm_min_epi8(self, other);
+                }
+                else
+                {
+                    return _mm_min_epu8(self, other);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm_min_epi16(self, other);
+                }
+                else
+                {
+                    return _mm_min_epu16(self, other);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                if (std::is_signed<T>::value)
+                {
+                    return _mm_min_epi32(self, other);
+                }
+                else
+                {
+                    return _mm_min_epu32(self, other);
+                }
+            }
+            else
+            {
+                return min(self, other, ssse3 {}); // other widths: resolved again one architecture level down
+            }
+        }
+
+        // mul (integral T): lane-wise product, truncated to the width of T
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                // built from 16-bit multiplies: even bytes in place, odd bytes shifted down, multiplied, shifted back
+                const __m128i low_byte_mask = _mm_srli_epi16(_mm_cmpeq_epi8(self, self), 8); // 0x00FF in every 16-bit lane
+                const __m128i even_products = _mm_and_si128(_mm_mullo_epi16(self, other), low_byte_mask);
+                const __m128i odd_products = _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_epi16(self, 8), _mm_srli_epi16(other, 8)), 8);
+                return _mm_or_si128(even_products, odd_products);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm_mullo_epi16(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_mullo_epi32(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                // a * b mod 2^64 = lo(a) * lo(b) + ((lo(a) * hi(b) + hi(a) * lo(b)) << 32)
+                const __m128i self_swapped = _mm_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1)); // hi dword moved into the lo slot
+                const __m128i other_swapped = _mm_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1));
+                const __m128i cross = _mm_add_epi64(_mm_mul_epu32(other, self_swapped), _mm_mul_epu32(self, other_swapped));
+                return _mm_add_epi64(_mm_mul_epu32(self, other), _mm_slli_epi64(cross, 32));
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
+        // nearbyint (float): rounds every lane with _mm_round_ps in _MM_FROUND_TO_NEAREST_INT mode
+        template <class A>
+        inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
+        }
+        template <class A> // nearbyint (double): _mm_round_pd in _MM_FROUND_TO_NEAREST_INT mode
+        inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
+        }
+
+        // select -- detail::interleave doubles every bit of an 8-bit mask: bit i of cond lands on bits 2i and 2i+1
+        namespace detail
+        {
+            template <class T>
+            inline constexpr T interleave(T const& cond) noexcept
+            {
+                return ((((0x0101010101010101ULL * cond) & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 48) & 0xAAAA) | ((((0x0101010101010101ULL * cond) & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 49) & 0x5555);
+            }
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_blendv_epi8(false_br, true_br, cond); // blendv takes the "mask set" operand second, hence false_br first
+        }
+        template <class A> // select (float): per lane, true_br where cond is set and false_br elsewhere
+        inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_blendv_ps(false_br, true_br, cond); // blendv takes the "mask set" operand second
+        }
+        template <class A> // select (double): per lane, true_br where cond is set and false_br elsewhere
+        inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_blendv_pd(false_br, true_br, cond); // blendv takes the "mask set" operand second
+        }
+
+        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
+        {
+            constexpr int lane_mask = batch_bool_constant<batch<T, A>, Values...>::mask(); // used as-is for 16-bit lanes
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                constexpr int dword_mask = detail::interleave(lane_mask); // each mask bit doubled once ...
+                constexpr int word_mask = detail::interleave(dword_mask); // ... and once more: one bit per 16-bit word
+                return _mm_blend_epi16(false_br, true_br, word_mask);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                constexpr int word_mask = detail::interleave(lane_mask); // two 16-bit words per 32-bit lane
+                return _mm_blend_epi16(false_br, true_br, word_mask);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm_blend_epi16(false_br, true_br, lane_mask);
+            }
+            else
+            {
+                return select(batch_bool_constant<batch<T, A>, Values...>(), true_br, false_br, ssse3 {});
+            }
+        }
+        template <class A, bool... Values>
+        inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
+        {
+            constexpr int lane_bits = batch_bool_constant<batch<float, A>, Values...>::mask(); // compile-time immediate for the blend
+            return _mm_blend_ps(false_br, true_br, lane_bits); // a set bit takes the lane from true_br
+        }
+        template <class A, bool... Values>
+        inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
+        {
+            constexpr int lane_bits = batch_bool_constant<batch<double, A>, Values...>::mask(); // compile-time immediate for the blend
+            return _mm_blend_pd(false_br, true_br, lane_bits); // a set bit takes the lane from true_br
+        }
+
+        // trunc (float): rounds every lane toward zero with _mm_round_ps in _MM_FROUND_TO_ZERO mode
+        template <class A>
+        inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_round_ps(self, _MM_FROUND_TO_ZERO);
+        }
+        template <class A> // trunc (double): _mm_round_pd in _MM_FROUND_TO_ZERO mode
+        inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp
new file mode 100644
index 0000000000..8f9b7a76e6
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp
@@ -0,0 +1,44 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSE4_2_HPP
+#define XSIMD_SSE4_2_HPP
+
+#include <limits>
+
+#include "../types/xsimd_sse4_2_register.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        // lt (int64_t): self < other, evaluated as other > self because only a greater-than compare is used
+        template <class A>
+        inline batch_bool<int64_t, A> lt(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<sse4_2>) noexcept
+        {
+            return _mm_cmpgt_epi64(other, self);
+        }
+        template <class A> // lt (uint64_t): unsigned compare built on the signed _mm_cmpgt_epi64
+        inline batch_bool<uint64_t, A> lt(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<sse4_2>) noexcept
+        {
+            const __m128i sign_flip = _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()); // only bit 63 set
+            const __m128i biased_self = _mm_xor_si128(self, sign_flip); // flipping bit 63 maps unsigned order onto signed order
+            return _mm_cmpgt_epi64(_mm_xor_si128(other, sign_flip), biased_self);
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp
new file mode 100644
index 0000000000..0aa1b2552d
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp
@@ -0,0 +1,142 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSSE3_HPP
+#define XSIMD_SSSE3_HPP
+
+#include <cstddef>
+#include <type_traits>
+
+#include "../types/xsimd_ssse3_register.hpp"
+#include "../types/xsimd_utils.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        // abs (signed integral T): lane-wise absolute value. The most negative value of T has no
+        // positive counterpart and comes back unchanged.
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
+        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<ssse3>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return _mm_abs_epi8(self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return _mm_abs_epi16(self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_abs_epi32(self);
+            }
+            else
+            {
+                // 64-bit lanes: _mm_abs_epi64 is an AVX-512 intrinsic, so derive the result from the sign mask.
+                assert(sizeof(T) == 8 && "unsupported arch/op combination");
+                // s is 0 or -1 per lane (sign dword replicated); (x ^ s) - s negates exactly the negative lanes
+                const __m128i sign = _mm_shuffle_epi32(_mm_srai_epi32(self, 31), _MM_SHUFFLE(3, 3, 1, 1));
+                return _mm_sub_epi64(_mm_xor_si128(self, sign), sign);
+            }
+        }
+
+        // extract_pair
+        namespace detail
+        {
+            // end of the search: no compile-time index matched i, so other is handed back untouched
+            template <class T, class A>
+            inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& other, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
+            {
+                return other;
+            }
+
+            template <class T, class A, std::size_t I, std::size_t... Is>
+            inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+            {
+                // _mm_alignr_epi8 needs a compile-time byte count, hence the linear search for the I equal to i
+                if (i != I)
+                {
+                    return extract_pair(self, other, i, ::xsimd::detail::index_sequence<Is...>());
+                }
+                return _mm_alignr_epi8(self, other, sizeof(T) * I);
+            }
+        }
+
+        template <class A, class T, class _ = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<ssse3>) noexcept
+        { // extract_pair (integral T): _mm_alignr_epi8 of self and other by i lanes of T, i being a runtime value
+            constexpr std::size_t size = batch<T, A>::size; // exclusive upper bound for i
+            assert(i < size && "index in bounds"); // i is unsigned, so a lower-bound check would be vacuous
+            return detail::extract_pair(self, other, i, ::xsimd::detail::make_index_sequence<size>());
+        }
+
+        // reduce_add (integral T): sum of all lanes; 16- and 32-bit lanes use the horizontal-add intrinsics
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline T reduce_add(batch<T, A> const& self, requires_arch<ssse3>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                const __m128i pairs = _mm_hadd_epi32(self, self); // 4 lanes -> 2 partial sums
+                const __m128i total = _mm_hadd_epi32(pairs, pairs); // lane 0 holds the full sum
+                return _mm_cvtsi128_si32(total);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                const __m128i quads = _mm_hadd_epi16(self, self); // 8 lanes -> 4 partial sums
+                const __m128i pairs = _mm_hadd_epi16(quads, quads);
+                const __m128i total = _mm_hadd_epi16(pairs, pairs);
+                return _mm_cvtsi128_si32(total) & 0xFFFF; // keep the low 16-bit lane only
+            }
+            else
+            {
+                return reduce_add(self, sse3 {});
+            }
+        }
+
+        // swizzle (uint16_t): widens the 16-bit lane indices to byte indices, then shuffles with _mm_shuffle_epi8
+        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+        inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
+        {
+            constexpr batch_constant<batch<uint8_t, A>, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
+                                     2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1> byte_indices; // lane Vk = bytes 2*Vk, 2*Vk + 1
+            const batch<uint8_t, A> shuffle_control = static_cast<batch<uint8_t, A>>(byte_indices);
+            return _mm_shuffle_epi8(self, shuffle_control);
+        }
+
+        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+        inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
+        {
+            return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, ssse3 {})); // signed lanes: reuse the uint16_t overload on the same bits
+        }
+
+        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
+                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
+        inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
+        {
+            return _mm_shuffle_epi8(self, static_cast<batch<uint8_t, A>>(mask)); // the constant's values are the byte indices of the shuffle
+        }
+
+        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
+                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
+        inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
+        {
+            return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, ssse3 {})); // signed lanes: reuse the uint8_t overload on the same bits
+        }
+
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp
new file mode 100644
index 0000000000..fa6e44e316
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp
@@ -0,0 +1,1126 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ * Copyright (c) Yibo Cai                                                   *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SVE_HPP
+#define XSIMD_SVE_HPP
+
+#include <complex>
+#include <type_traits>
+
+#include "../types/xsimd_sve_register.hpp"
+
+namespace xsimd
+{
+    template <class batch_type, typename batch_type::value_type... Values>
+    struct batch_constant; // forward declaration only: no definition is given in this part of the header
+
+    namespace kernel
+    {
+        namespace detail
+        {
+            using xsimd::index;
+            using xsimd::types::detail::sve_vector_type;
+
+            // all-true predicate, one overload per element size in bytes
+            inline svbool_t sve_ptrue_impl(index<1>) noexcept { return svptrue_b8(); }
+            inline svbool_t sve_ptrue_impl(index<2>) noexcept { return svptrue_b16(); }
+            inline svbool_t sve_ptrue_impl(index<4>) noexcept { return svptrue_b32(); }
+            inline svbool_t sve_ptrue_impl(index<8>) noexcept { return svptrue_b64(); }
+
+            template <class T> // all-true predicate for elements of sizeof(T) bytes
+            svbool_t sve_ptrue() noexcept { return sve_ptrue_impl(index<sizeof(T)>()); }
+
+            // number of active lanes in pred, which also serves as its own governing predicate
+            inline uint64_t sve_pcount_impl(svbool_t pred, index<1>) noexcept { return svcntp_b8(pred, pred); }
+            inline uint64_t sve_pcount_impl(svbool_t pred, index<2>) noexcept { return svcntp_b16(pred, pred); }
+            inline uint64_t sve_pcount_impl(svbool_t pred, index<4>) noexcept { return svcntp_b32(pred, pred); }
+            inline uint64_t sve_pcount_impl(svbool_t pred, index<8>) noexcept { return svcntp_b64(pred, pred); }
+
+            template <class T>
+            inline uint64_t sve_pcount(svbool_t pred) noexcept { return sve_pcount_impl(pred, index<sizeof(T)>()); }
+
+            // SFINAE guard (int when satisfied): T is a signed integer
+            template <class T>
+            using sve_enable_signed_int_t = typename std::enable_if<std::is_signed<T>::value && std::is_integral<T>::value, int>::type;
+
+            // SFINAE guard: T is an unsigned integer
+            template <class T>
+            using sve_enable_unsigned_int_t = typename std::enable_if<std::is_integral<T>::value && std::is_unsigned<T>::value, int>::type;
+
+            // SFINAE guard: T is a floating point type
+            template <class T>
+            using sve_enable_floating_point_t = typename std::enable_if<std::is_floating_point<T>::value, int>::type;
+
+            // SFINAE guard: T is a signed integer or a floating point type (both satisfy std::is_signed)
+            template <class T>
+            using sve_enable_signed_int_or_floating_point_t = typename std::enable_if<std::is_signed<T>::value, int>::type;
+
+            // SFINAE guard: any arithmetic T, i.e. every element type handled by the SVE kernels
+            template <class T>
+            using sve_enable_all_t = typename std::enable_if<std::is_arithmetic<T>::value, int>::type;
+        } // namespace detail
+
+        /*********
+         * Load *
+         *********/
+
+        namespace detail
+        {
+            // "char" is not allowed in SVE load/store operations: substitute the 8-bit fixed-width type of the same signedness
+            using sve_fix_char_t_impl = typename std::conditional<std::is_unsigned<char>::value, uint8_t, int8_t>::type;
+
+            template <class T> // char (cv/ref qualifiers ignored) becomes sve_fix_char_t_impl, any other T is kept
+            using sve_fix_char_t = typename std::conditional<std::is_same<typename std::decay<T>::type, char>::value,
+                                                             sve_fix_char_t_impl, T>::type;
+        }
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0> // load_aligned: svld1 with an all-true predicate
+        inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<sve>) noexcept
+        {
+            return svld1(detail::sve_ptrue<T>(), reinterpret_cast<detail::sve_fix_char_t<T> const*>(src)); // char pointers are retyped to (u)int8_t first
+        }
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0> // load_unaligned: same code path as load_aligned
+        inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<sve>) noexcept
+        {
+            return load_aligned<A>(src, convert<T>(), sve {});
+        }
+
+        // load_complex: svld2 splits the (real, imag) pairs at mem into one vector of real and one of imaginary parts
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        inline batch<std::complex<T>, A> load_complex_aligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
+        {
+            const T* scalars = reinterpret_cast<const T*>(mem); // complex storage viewed as consecutive T values
+            const auto parts = svld2(detail::sve_ptrue<T>(), scalars);
+            const auto re = svget2(parts, 0);
+            const auto im = svget2(parts, 1);
+            return batch<std::complex<T>, A> { re, im };
+        }
+
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0> // same code path as load_complex_aligned
+        inline batch<std::complex<T>, A> load_complex_unaligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
+        {
+            return load_complex_aligned<A>(mem, convert<std::complex<T>> {}, sve {});
+        }
+
+        /*********
+         * Store *
+         *********/
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0> // store_aligned: svst1 with an all-true predicate
+        inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<sve>) noexcept
+        {
+            svst1(detail::sve_ptrue<T>(), reinterpret_cast<detail::sve_fix_char_t<T>*>(dst), src); // char pointers are retyped to (u)int8_t first
+        }
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0> // store_unaligned: same code path as store_aligned
+        inline void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<sve>) noexcept
+        {
+            store_aligned<A>(dst, src, sve {});
+        }
+
+        // store_complex: packs src.real() and src.imag() into a two-vector tuple and writes it to dst with svst2
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        inline void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
+        {
+            using pair_type = typename std::conditional<(sizeof(T) != 4), svfloat64x2_t, svfloat32x2_t>::type;
+            T* scalars = reinterpret_cast<T*>(dst); // complex storage viewed as consecutive T values
+            pair_type parts {};
+            parts = svset2(parts, 0, src.real());
+            parts = svset2(parts, 1, src.imag());
+            svst2(detail::sve_ptrue<T>(), scalars, parts);
+        }
+
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0> // same code path as store_complex_aligned
+        inline void store_complex_unaligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
+        {
+            store_complex_aligned(dst, src, sve {}); // A is deduced from src
+        }
+
+        /******************
+         * scatter/gather *
+         ******************/
+
+        namespace detail
+        {
+            template <class T, class U> // scatter/gather guard: 32- or 64-bit elements, with indices of the same width
+            using sve_enable_sg_t = typename std::enable_if<((sizeof(T) == 4 || sizeof(T) == 8) && sizeof(U) == sizeof(T)), int>::type;
+        }
+
+        // scatter: writes every lane of src to dst at the element index given by the matching lane of index
+        template <class A, class T, class U, detail::sve_enable_sg_t<T, U> = 0>
+        inline void scatter(batch<T, A> const& src, T* dst, batch<U, A> const& index, kernel::requires_arch<sve>) noexcept
+        {
+            svst1_scatter_index(detail::sve_ptrue<T>(), dst, index.data, src.data);
+        }
+
+        // gather: loads every lane from src at the element index given by the matching lane of index; the first argument is unused
+        template <class A, class T, class U, detail::sve_enable_sg_t<T, U> = 0>
+        inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index, kernel::requires_arch<sve>) noexcept
+        {
+            return svld1_gather_index(detail::sve_ptrue<T>(), src, index.data);
+        }
+
+        /********************
+         * Scalar to vector *
+         ********************/
+
+        // broadcast: replicate a scalar into every lane; one overload per lane type, selected by SFINAE on T (this one: 8-bit unsigned)
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+        {
+            return svdup_n_u8(uint8_t(arg)); // cast to the exact element type expected by the intrinsic
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> // broadcast, 8-bit signed lanes
+        inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+        {
+            return svdup_n_s8(int8_t(arg)); // cast to the exact element type expected by the intrinsic
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> // broadcast, 16-bit unsigned lanes
+        inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+        {
+            return svdup_n_u16(uint16_t(arg)); // cast to the exact element type expected by the intrinsic
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> // broadcast, 16-bit signed lanes
+        inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+        {
+            return svdup_n_s16(int16_t(arg)); // cast to the exact element type expected by the intrinsic
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> // broadcast, 32-bit unsigned lanes
+        inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+        {
+            return svdup_n_u32(uint32_t(arg)); // cast to the exact element type expected by the intrinsic
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> // broadcast, 32-bit signed lanes
+        inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+        {
+            return svdup_n_s32(int32_t(arg)); // cast to the exact element type expected by the intrinsic
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> // broadcast, 64-bit unsigned lanes
+        inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+        {
+            return svdup_n_u64(uint64_t(arg)); // cast to the exact element type expected by the intrinsic
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> // broadcast, 64-bit signed lanes
+        inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+        {
+            return svdup_n_s64(int64_t(arg)); // cast to the exact element type expected by the intrinsic
+        }
+
+        template <class A> // broadcast, float lanes (non-template scalar type, only A is a template parameter)
+        inline batch<float, A> broadcast(float arg, requires_arch<sve>) noexcept
+        {
+            return svdup_n_f32(arg); // replicate arg into every 32-bit float lane
+        }
+
+        template <class A> // broadcast, double lanes (non-template scalar type, only A is a template parameter)
+        inline batch<double, A> broadcast(double arg, requires_arch<sve>) noexcept
+        {
+            return svdup_n_f64(arg); // replicate arg into every 64-bit float lane
+        }
+
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0> // broadcast entry taking T as a template parameter, floating-point T only
+        inline batch<T, A> broadcast(T val, requires_arch<sve>) noexcept
+        {
+            return broadcast<sve>(val, sve {}); // re-dispatch with the arch (sve) as the only explicit template argument
+        }
+
+        /**************
+         * Arithmetic *
+         **************/
+
+        // add: lane-wise lhs + rhs (svadd_x under the detail::sve_ptrue<T>() predicate)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svadd_x(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        // sadd: saturating lane-wise addition, enabled for integral T only
+        template <class A, class T, detail::enable_integral_t<T> = 0>
+        inline batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svqadd(lhs, rhs); // svqadd is called without a predicate argument
+        }
+
+        // sub: lane-wise lhs - rhs (svsub_x under the detail::sve_ptrue<T>() predicate)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svsub_x(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        // ssub: saturating lane-wise subtraction, enabled for integral T only
+        template <class A, class T, detail::enable_integral_t<T> = 0>
+        inline batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svqsub(lhs, rhs); // svqsub is called without a predicate argument
+        }
+
+        // mul: lane-wise lhs * rhs (svmul_x under the detail::sve_ptrue<T>() predicate)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svmul_x(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        // div: lane-wise lhs / rhs via svdiv_x; this overload is only enabled for element types of at least 4 bytes
+        template <class A, class T, typename std::enable_if<sizeof(T) >= 4, int>::type = 0>
+        inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svdiv_x(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        // max: lane-wise maximum of lhs and rhs (svmax_x)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svmax_x(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        // min: lane-wise minimum of lhs and rhs (svmin_x)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svmin_x(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        // neg: lane-wise negation; unsigned lanes are reinterpreted as signed, negated, and reinterpreted back (this one: 8-bit)
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u8(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s8(arg)));
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> // neg, 16-bit unsigned lanes
+        inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u16(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s16(arg))); // u16 -> s16, negate, -> u16
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> // neg, 32-bit unsigned lanes
+        inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u32(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s32(arg))); // u32 -> s32, negate, -> u32
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> // neg, 64-bit unsigned lanes
+        inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u64(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s64(arg))); // u64 -> s64, negate, -> u64
+        }
+
+        template <class A, class T, detail::sve_enable_signed_int_or_floating_point_t<T> = 0> // neg, signed integer and floating-point lanes
+        inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svneg_x(detail::sve_ptrue<T>(), arg); // negated directly, no reinterpretation needed
+        }
+
+        // abs: lane-wise absolute value; for unsigned lanes the argument is returned unchanged
+        template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
+        inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return arg;
+        }
+
+        template <class A, class T, detail::sve_enable_signed_int_or_floating_point_t<T> = 0> // abs, signed integer and floating-point lanes
+        inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svabs_x(detail::sve_ptrue<T>(), arg);
+        }
+
+        // fma: x * y + z (svmad_x: product of the first two operands plus the third)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
+        {
+            return svmad_x(detail::sve_ptrue<T>(), x, y, z);
+        }
+
+        // fnma: z - x * y (svmsb_x: third operand minus the product of the first two)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
+        {
+            return svmsb_x(detail::sve_ptrue<T>(), x, y, z);
+        }
+
+        // fms: x * y - z, computed as the negation of fnma (z - x * y)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
+        {
+            return -fnma(x, y, z, sve {});
+        }
+
+        // fnms: - x * y - z, computed as the negation of fma (x * y + z)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
+        {
+            return -fma(x, y, z, sve {});
+        }
+
+        /**********************
+         * Logical operations *
+         **********************/
+
+        // bitwise_and: lane-wise AND; this overload handles integral batches (svand_x)
+        template <class A, class T, detail::enable_integral_t<T> = 0>
+        inline batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svand_x(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        template <class A> // bitwise_and for float: AND of the raw 32-bit lane patterns
+        inline batch<float, A> bitwise_and(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            // view both operands as u32 lanes, combine them, then view the result as f32 again
+            return svreinterpret_f32(svand_x(detail::sve_ptrue<float>(),
+                                             svreinterpret_u32(lhs),
+                                             svreinterpret_u32(rhs)));
+        }
+
+        template <class A> // bitwise_and for double: AND of the raw 64-bit lane patterns
+        inline batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            // view both operands as u64 lanes, combine them, then view the result as f64 again
+            return svreinterpret_f64(svand_x(detail::sve_ptrue<double>(),
+                                             svreinterpret_u64(lhs),
+                                             svreinterpret_u64(rhs)));
+        }
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0> // bitwise_and for batch_bool: AND of two predicates
+        inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svand_z(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        // bitwise_andnot: lane-wise lhs AND (NOT rhs); this overload handles integral batches (svbic_x)
+        template <class A, class T, detail::enable_integral_t<T> = 0>
+        inline batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svbic_x(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        template <class A> // bitwise_andnot for float: bit-clear on the raw 32-bit lane patterns
+        inline batch<float, A> bitwise_andnot(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            // view both operands as u32 lanes, combine them, then view the result as f32 again
+            return svreinterpret_f32(svbic_x(detail::sve_ptrue<float>(),
+                                             svreinterpret_u32(lhs),
+                                             svreinterpret_u32(rhs)));
+        }
+
+        template <class A> // bitwise_andnot for double: bit-clear on the raw 64-bit lane patterns
+        inline batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            // view both operands as u64 lanes, combine them, then view the result as f64 again
+            return svreinterpret_f64(svbic_x(detail::sve_ptrue<double>(),
+                                             svreinterpret_u64(lhs),
+                                             svreinterpret_u64(rhs)));
+        }
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0> // bitwise_andnot for batch_bool: lhs AND (NOT rhs) on predicates
+        inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svbic_z(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        // bitwise_or: lane-wise OR; this overload handles integral batches (svorr_x)
+        template <class A, class T, detail::enable_integral_t<T> = 0>
+        inline batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svorr_x(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        template <class A> // bitwise_or for float: OR of the raw 32-bit lane patterns
+        inline batch<float, A> bitwise_or(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            // view both operands as u32 lanes, combine them, then view the result as f32 again
+            return svreinterpret_f32(svorr_x(detail::sve_ptrue<float>(),
+                                             svreinterpret_u32(lhs),
+                                             svreinterpret_u32(rhs)));
+        }
+
+        template <class A> // bitwise_or for double: OR of the raw 64-bit lane patterns
+        inline batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            // view both operands as u64 lanes, combine them, then view the result as f64 again
+            return svreinterpret_f64(svorr_x(detail::sve_ptrue<double>(),
+                                             svreinterpret_u64(lhs),
+                                             svreinterpret_u64(rhs)));
+        }
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0> // bitwise_or for batch_bool: OR of two predicates
+        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svorr_z(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        // bitwise_xor: lane-wise exclusive OR; this overload handles integral batches (sveor_x)
+        template <class A, class T, detail::enable_integral_t<T> = 0>
+        inline batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return sveor_x(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        template <class A> // bitwise_xor for float: XOR of the raw 32-bit lane patterns
+        inline batch<float, A> bitwise_xor(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            // view both operands as u32 lanes, combine them, then view the result as f32 again
+            return svreinterpret_f32(sveor_x(detail::sve_ptrue<float>(),
+                                             svreinterpret_u32(lhs),
+                                             svreinterpret_u32(rhs)));
+        }
+
+        template <class A> // bitwise_xor for double: XOR of the raw 64-bit lane patterns
+        inline batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            // view both operands as u64 lanes, combine them, then view the result as f64 again
+            return svreinterpret_f64(sveor_x(detail::sve_ptrue<double>(),
+                                             svreinterpret_u64(lhs),
+                                             svreinterpret_u64(rhs)));
+        }
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0> // bitwise_xor for batch_bool: XOR of two predicates
+        inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return sveor_z(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        // bitwise_not: lane-wise bit inversion; this overload handles integral batches (svnot_x)
+        template <class A, class T, detail::enable_integral_t<T> = 0>
+        inline batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svnot_x(detail::sve_ptrue<T>(), arg);
+        }
+
+        template <class A> // bitwise_not for float: invert the raw 32-bit lane patterns
+        inline batch<float, A> bitwise_not(batch<float, A> const& arg, requires_arch<sve>) noexcept
+        {
+            // view the operand as u32 lanes, invert, then view the result as f32 again
+            return svreinterpret_f32(svnot_x(detail::sve_ptrue<float>(),
+                                             svreinterpret_u32(arg)));
+        }
+
+        template <class A> // bitwise_not for double: invert the raw 64-bit lane patterns
+        inline batch<double, A> bitwise_not(batch<double, A> const& arg, requires_arch<sve>) noexcept
+        {
+            // view the operand as u64 lanes, invert, then view the result as f64 again
+            return svreinterpret_f64(svnot_x(detail::sve_ptrue<double>(),
+                                             svreinterpret_u64(arg)));
+        }
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0> // bitwise_not for batch_bool: predicate inversion
+        inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svnot_z(detail::sve_ptrue<T>(), arg);
+        }
+
+        /**********
+         * Shifts *
+         **********/
+
+        namespace detail // helpers that reinterpret a batch as unsigned lanes of the same byte width
+        {
+            template <class A, class T, class U> // 1-byte lanes -> u8
+            inline batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<1>) noexcept
+            {
+                return svreinterpret_u8(arg);
+            }
+
+            template <class A, class T, class U> // 2-byte lanes -> u16
+            inline batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<2>) noexcept
+            {
+                return svreinterpret_u16(arg);
+            }
+
+            template <class A, class T, class U> // 4-byte lanes -> u32
+            inline batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<4>) noexcept
+            {
+                return svreinterpret_u32(arg);
+            }
+
+            template <class A, class T, class U> // 8-byte lanes -> u64
+            inline batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<8>) noexcept
+            {
+                return svreinterpret_u64(arg);
+            }
+
+            template <class A, class T, class U = as_unsigned_integer_t<T>> // entry point; U defaults to the unsigned counterpart of T
+            inline batch<U, A> sve_to_unsigned_batch(batch<T, A> const& arg) noexcept
+            {
+                return sve_to_unsigned_batch_impl<A, T, U>(arg, index<sizeof(T)> {}); // tag-dispatch on sizeof(T)
+            }
+        } // namespace detail
+
+        // bitwise_lshift: shift every lane left by the same count n (integral T)
+        template <class A, class T, detail::enable_integral_t<T> = 0>
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
+        {
+            constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8; // lane width in bits
+            assert(0 <= n && static_cast<std::size_t>(n) < size && "index in bounds"); // n must lie in [0, lane width)
+            return svlsl_x(detail::sve_ptrue<T>(), arg, n);
+        }
+
+        template <class A, class T, detail::enable_integral_t<T> = 0> // bitwise_lshift with per-lane shift counts taken from rhs
+        inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svlsl_x(detail::sve_ptrue<T>(), lhs, detail::sve_to_unsigned_batch<A, T>(rhs)); // counts are passed as unsigned lanes
+        }
+
+        // bitwise_rshift: shift every lane right by the same count n; unsigned lanes use the logical shift svlsr_x
+        template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
+        {
+            constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8; // lane width in bits
+            assert(0 <= n && static_cast<std::size_t>(n) < size && "index in bounds"); // n must lie in [0, lane width)
+            return svlsr_x(detail::sve_ptrue<T>(), arg, static_cast<T>(n)); // count converted to the (unsigned) lane type
+        }
+
+        template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0> // bitwise_rshift, unsigned lanes, per-lane counts from rhs
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svlsr_x(detail::sve_ptrue<T>(), lhs, rhs); // logical shift right; rhs is already unsigned
+        }
+
+        template <class A, class T, detail::sve_enable_signed_int_t<T> = 0> // bitwise_rshift, signed lanes, uniform count n: arithmetic shift svasr_x
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
+        {
+            constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8; // lane width in bits
+            assert(0 <= n && static_cast<std::size_t>(n) < size && "index in bounds"); // n must lie in [0, lane width)
+            return svasr_x(detail::sve_ptrue<T>(), arg, static_cast<as_unsigned_integer_t<T>>(n)); // count converted to the unsigned counterpart of T
+        }
+
+        template <class A, class T, detail::sve_enable_signed_int_t<T> = 0> // bitwise_rshift, signed lanes, per-lane counts from rhs
+        inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svasr_x(detail::sve_ptrue<T>(), lhs, detail::sve_to_unsigned_batch<A, T>(rhs)); // counts are passed as unsigned lanes
+        }
+
+        /**************
+         * Reductions *
+         **************/
+
+        // reduce_add: horizontal sum of all lanes (svaddv), returned as V, which defaults to the batch's value_type
+        template <class A, class T, class V = typename batch<T, A>::value_type, detail::sve_enable_all_t<T> = 0>
+        inline V reduce_add(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            // sve integer reduction results are promoted to 64 bits, hence the cast back to V
+            return static_cast<V>(svaddv(detail::sve_ptrue<T>(), arg));
+        }
+
+        // reduce_max: largest lane value of arg as a scalar (svmaxv)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline T reduce_max(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svmaxv(detail::sve_ptrue<T>(), arg);
+        }
+
+        // reduce_min: smallest lane value of arg as a scalar (svminv)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline T reduce_min(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svminv(detail::sve_ptrue<T>(), arg);
+        }
+
+        // haddp: lane i of the result holds the horizontal sum of row[i] (row must provide batch size entries)
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        inline batch<T, A> haddp(const batch<T, A>* row, requires_arch<sve>) noexcept
+        {
+            constexpr std::size_t lanes = batch<T, A>::size;
+            T row_sums[lanes]; // scalar staging buffer, one slot per result lane
+            for (std::size_t r = 0; r != lanes; ++r)
+            {
+                row_sums[r] = reduce_add(row[r], sve {});
+            }
+            return svld1(detail::sve_ptrue<T>(), row_sums);
+        }
+
+        /***************
+         * Comparisons *
+         ***************/
+
+        // eq: lane-wise lhs == rhs, producing a batch_bool (svcmpeq)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svcmpeq(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0> // eq for batch_bool: lanes where both masks agree
+        inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            // NOT (lhs XOR rhs), both steps governed by the detail::sve_ptrue<T>() predicate
+            return svnot_z(detail::sve_ptrue<T>(), sveor_z(detail::sve_ptrue<T>(), lhs, rhs));
+        }
+
+        // neq: lane-wise lhs != rhs, producing a batch_bool (svcmpne)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch_bool<T, A> neq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svcmpne(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0> // neq for batch_bool: lanes where the two masks differ
+        inline batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return sveor_z(detail::sve_ptrue<T>(), lhs, rhs); // XOR of the two predicates
+        }
+
+        // lt: lane-wise lhs < rhs, producing a batch_bool (svcmplt)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svcmplt(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        // le: lane-wise lhs <= rhs, producing a batch_bool (svcmple)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svcmple(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        // gt: lane-wise lhs > rhs, producing a batch_bool (svcmpgt)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svcmpgt(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        // ge: lane-wise lhs >= rhs, producing a batch_bool (svcmpge)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svcmpge(detail::sve_ptrue<T>(), lhs, rhs);
+        }
+
+        /***************
+         * Permutation *
+         ***************/
+
+        // swizzle: permute the lanes of arg with a compile-time index list (table lookup through svtbl)
+        template <class A, class T, class I, I... idx>
+        inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<batch<I, A>, idx...>, requires_arch<sve>) noexcept
+        {
+            static_assert(batch<T, A>::size == sizeof...(idx), "invalid swizzle indices"); // exactly one index per lane
+            const batch<I, A> indices { idx... }; // materialise the constant indices as a runtime batch
+            return svtbl(arg, indices);
+        }
+
+        template <class A, class T, class I, I... idx> // swizzle for complex batches: the same permutation is applied to real and imaginary parts
+        inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self,
+                                                 batch_constant<batch<I, A>, idx...>,
+                                                 requires_arch<sve>) noexcept
+        {
+            const auto real = swizzle(self.real(), batch_constant<batch<I, A>, idx...> {}, sve {});
+            const auto imag = swizzle(self.imag(), batch_constant<batch<I, A>, idx...> {}, sve {});
+            return batch<std::complex<T>, A>(real, imag); // A is named explicitly so the result matches the declared return type for any A
+        }
+
+        /*************
+         * Selection *
+         *************/
+
+        // extract_pair helpers: map the runtime offset n onto a compile-time constant usable by svext
+        namespace detail
+        {
+            template <class A, class T>
+            inline batch<T, A> sve_extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
+            {
+                assert(false && "extract_pair out of bounds"); // candidate list exhausted without a match
+                return batch<T, A> {};
+            }
+
+            template <class A, class T, size_t I, size_t... Is>
+            inline batch<T, A> sve_extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+            {
+                if (n != I)
+                {
+                    // not this offset: try the remaining candidates
+                    return sve_extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+                }
+                // n matches the template constant I, which can be handed to
+                // svext as the constant offset it requires
+                return svext(rhs, lhs, I);
+            }
+
+            template <class A, class T, size_t... Is>
+            inline batch<T, A> sve_extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
+            {
+                if (n != 0)
+                {
+                    // non-zero offsets go through the svext dispatch chain
+                    return sve_extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+                }
+                // offset 0 yields rhs unchanged, so no svext
+                // call is issued for it
+                return rhs;
+            }
+        }
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0> // extract_pair: public kernel, forwards to the detail dispatcher
+        inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<sve>) noexcept
+        {
+            constexpr std::size_t size = batch<T, A>::size; // number of lanes
+            assert(n < size && "index in bounds"); // n must be a valid lane offset
+            return detail::sve_extract_pair_impl(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>()); // candidates 0 .. size-1
+        }
+
+        // select: per lane, take a where cond is set and b elsewhere (svsel)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<sve>) noexcept
+        {
+            return svsel(cond, a, b);
+        }
+
+        template <class A, class T, bool... b> // select with a compile-time mask
+        inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sve>) noexcept
+        {
+            return select(batch_bool<T, A> { b... }, true_br, false_br, sve {}); // build a runtime batch_bool from the constants, then reuse the runtime select
+        }
+
+        // zip_lo: interleave the lanes from the low halves of lhs and rhs (svzip1)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svzip1(lhs, rhs);
+        }
+
+        // zip_hi: interleave the lanes from the high halves of lhs and rhs (svzip2)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svzip2(lhs, rhs);
+        }
+
+        /*****************************
+         * Floating-point arithmetic *
+         *****************************/
+
+        // rsqrt: lane-wise reciprocal square root through svrsqrte (an estimate per ACLE; no refinement step is applied here)
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        inline batch<T, A> rsqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svrsqrte(arg);
+        }
+
+        // sqrt: lane-wise square root for floating-point batches (svsqrt_x)
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        inline batch<T, A> sqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svsqrt_x(detail::sve_ptrue<T>(), arg);
+        }
+
+        // reciprocal: lane-wise 1/arg through svrecpe (an estimate per ACLE; no refinement step is applied here)
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        inline batch<T, A> reciprocal(const batch<T, A>& arg, requires_arch<sve>) noexcept
+        {
+            return svrecpe(arg);
+        }
+
+        /******************************
+         * Floating-point conversions *
+         ******************************/
+
+        // fast_cast: same-width conversions between integer and floating-point batches; the unnamed second parameter only selects the destination type
+        namespace detail
+        {
+            template <class A, class T, detail::enable_sized_integral_t<T, 4> = 0> // 4-byte integer -> float
+            inline batch<float, A> fast_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
+            {
+                return svcvt_f32_x(detail::sve_ptrue<T>(), arg);
+            }
+
+            template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> // 8-byte integer -> double
+            inline batch<double, A> fast_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
+            {
+                return svcvt_f64_x(detail::sve_ptrue<T>(), arg);
+            }
+
+            template <class A> // float -> int32_t
+            inline batch<int32_t, A> fast_cast(batch<float, A> const& arg, batch<int32_t, A> const&, requires_arch<sve>) noexcept
+            {
+                return svcvt_s32_x(detail::sve_ptrue<float>(), arg);
+            }
+
+            template <class A> // float -> uint32_t
+            inline batch<uint32_t, A> fast_cast(batch<float, A> const& arg, batch<uint32_t, A> const&, requires_arch<sve>) noexcept
+            {
+                return svcvt_u32_x(detail::sve_ptrue<float>(), arg);
+            }
+
+            template <class A> // double -> int64_t
+            inline batch<int64_t, A> fast_cast(batch<double, A> const& arg, batch<int64_t, A> const&, requires_arch<sve>) noexcept
+            {
+                return svcvt_s64_x(detail::sve_ptrue<double>(), arg);
+            }
+
+            template <class A> // double -> uint64_t
+            inline batch<uint64_t, A> fast_cast(batch<double, A> const& arg, batch<uint64_t, A> const&, requires_arch<sve>) noexcept
+            {
+                return svcvt_u64_x(detail::sve_ptrue<double>(), arg);
+            }
+        }
+
+        /*********
+         * Miscs *
+         *********/
+
+        // set: build a batch from an explicit list of lane values; the unnamed batch parameter only selects the result type
+        template <class A, class T, class... Args>
+        inline batch<T, A> set(batch<T, A> const&, requires_arch<sve>, Args... args) noexcept
+        {
+            return detail::sve_vector_type<T> { args... }; // brace-initialise the vector type from the values
+        }
+
+        template <class A, class T, class... Args> // set for complex batches: each argument is split into its real and imaginary part
+        inline batch<std::complex<T>, A> set(batch<std::complex<T>, A> const&, requires_arch<sve>,
+                                             Args... args_complex) noexcept
+        {
+            return batch<std::complex<T>, A>(detail::sve_vector_type<T> { args_complex.real()... }, // A is named explicitly to match the declared return type
+                                             detail::sve_vector_type<T> { args_complex.imag()... });
+        }
+
+        template <class A, class T, class... Args> // set for batch_bool: a lane is active when its argument converts to a non-zero value
+        inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sve>, Args... args) noexcept
+        {
+            using U = as_unsigned_integer_t<T>; // unsigned integer counterpart of T
+            const auto values = detail::sve_vector_type<U> { static_cast<U>(args)... }; // each argument converted to U
+            const auto zero = broadcast<A, U>(static_cast<U>(0), sve {});
+            return svcmpne(detail::sve_ptrue<T>(), values, zero); // predicate of the lanes whose value differs from 0
+        }
+
+        // insert (preceded by the lane-index helper it relies on)
+        namespace detail
+        {
+            // generate index sequence (iota): 0, 1, 2, ... in unsigned lanes of the requested byte width
+            inline svuint8_t sve_iota_impl(index<1>) noexcept { return svindex_u8(0, 1); } // base 0, step 1
+            inline svuint16_t sve_iota_impl(index<2>) noexcept { return svindex_u16(0, 1); }
+            inline svuint32_t sve_iota_impl(index<4>) noexcept { return svindex_u32(0, 1); }
+            inline svuint64_t sve_iota_impl(index<8>) noexcept { return svindex_u64(0, 1); }
+
+            template <class T, class V = sve_vector_type<as_unsigned_integer_t<T>>> // V: unsigned vector type with lanes as wide as T
+            inline V sve_iota() noexcept { return sve_iota_impl(index<sizeof(T)> {}); } // tag-dispatch on sizeof(T)
+        } // namespace detail
+
+        template <class A, class T, size_t I, detail::sve_enable_all_t<T> = 0> // insert: copy of arg with lane I replaced by val
+        inline batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<sve>) noexcept
+        {
+            // create a predicate with only the I-th lane activated
+            const auto iota = detail::sve_iota<T>(); // lane numbers 0, 1, 2, ...
+            const auto index_predicate = svcmpeq(detail::sve_ptrue<T>(), iota, static_cast<as_unsigned_integer_t<T>>(I)); // true only where lane number == I
+            return svsel(index_predicate, broadcast<A, T>(val, sve {}), arg); // val in the selected lane, arg elsewhere
+        }
+
+        // all: true when the count returned by sve_pcount for the mask equals the number of lanes of the mask
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline bool all(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return detail::sve_pcount<T>(arg) == batch_bool<T, A>::size;
+        }
+
+        // any: true when at least one lane of the mask is active (arg is both the governing predicate and the operand)
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline bool any(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svptest_any(arg, arg);
+        }
+
+        // bitwise_cast: reinterprets the register bits of arg as lanes of R (svreinterpret), one overload per target element type
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 1> = 0>
+        inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u8(arg);
+        }
+
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 1> = 0>
+        inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_s8(arg);
+        }
+
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 2> = 0>
+        inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u16(arg);
+        }
+
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 2> = 0>
+        inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_s16(arg);
+        }
+
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 4> = 0>
+        inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u32(arg);
+        }
+
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 4> = 0>
+        inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_s32(arg);
+        }
+
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 8> = 0>
+        inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_u64(arg);
+        }
+
+        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 8> = 0>
+        inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_s64(arg);
+        }
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<float, A> bitwise_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_f32(arg);
+        }
+
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
+        {
+            return svreinterpret_f64(arg);
+        }
+
+        // batch_bool_cast: the underlying predicate (arg.data) is handed over unchanged to the mask of the other element type
+        template <class A, class T_out, class T_in, detail::sve_enable_all_t<T_in> = 0>
+        inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<sve>) noexcept
+        {
+            return arg.data;
+        }
+
+        // from_bool: turns a mask into numbers by selecting between a batch of 1 and a batch of 0 with arg as condition
+        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return select(arg, batch<T, A>(1), batch<T, A>(0));
+        }
+
+        // slide_left
+        namespace detail
+        {
+            template <size_t N>
+            struct sve_slider_left
+            {
+                template <class A, class T>
+                inline batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                {
+                    // byte-wise extract from {zeros, arg} at offset size - N: N zero bytes, then the first size - N bytes of arg
+                    using byte_batch = batch<uint8_t, A>;
+                    const auto arg_bytes = bitwise_cast(arg, byte_batch {}, sve {}).data;
+                    const byte_batch shifted(svext(svdup_n_u8(0), arg_bytes, byte_batch::size - N));
+                    return bitwise_cast(shifted, batch<T, A> {}, sve {});
+                }
+            };
+
+            template <>
+            struct sve_slider_left<0>
+            {
+                template <class A, class T>
+                inline batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                {
+                    return arg; // nothing to slide (the generic form would hand svext an index equal to the vector size)
+                }
+            };
+        } // namespace detail
+
+        template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return detail::sve_slider_left<N> {}(arg);
+        }
+
+        // slide_right
+        namespace detail
+        {
+            template <size_t N>
+            struct sve_slider_right
+            {
+                template <class A, class T>
+                inline batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                {
+                    // byte-wise extract from {arg, zeros} at offset N: the bytes of arg from N onwards, then N zero bytes
+                    using byte_batch = batch<uint8_t, A>;
+                    const auto arg_bytes = bitwise_cast(arg, byte_batch {}, sve {}).data;
+                    const byte_batch shifted(svext(arg_bytes, svdup_n_u8(0), N));
+                    return bitwise_cast(shifted, batch<T, A> {}, sve {});
+                }
+            };
+
+            template <>
+            struct sve_slider_right<batch<uint8_t, sve>::size>
+            {
+                template <class A, class T>
+                inline batch<T, A> operator()(batch<T, A> const&) noexcept
+                {
+                    return batch<T, A> {}; // full-width slide: no byte of arg is kept, a value-initialized batch is returned
+                }
+            };
+        } // namespace detail
+
+        template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
+        inline batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return detail::sve_slider_right<N> {}(arg);
+        }
+
+        // isnan: a lane is flagged exactly when it does not compare equal to itself
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        inline batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return !(arg == arg);
+        }
+
+        // nearbyint: rounds every lane to an integral value with svrintx (rounding follows the current rounding mode)
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        inline batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<sve>) noexcept
+        {
+            return svrintx_x(detail::sve_ptrue<T>(), arg);
+        }
+
+        // nearbyint_as_int: rounds like nearbyint (svrintx, current rounding mode), then converts to the same-width signed integer
+        template <class A>
+        inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& arg, requires_arch<sve>) noexcept
+        {
+            const auto nearest = svrintx_x(detail::sve_ptrue<float>(), arg); // not svrinta: ties must not round away from zero
+            return svcvt_s32_x(detail::sve_ptrue<float>(), nearest);
+        }
+
+        template <class A>
+        inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& arg, requires_arch<sve>) noexcept
+        {
+            const auto nearest = svrintx_x(detail::sve_ptrue<double>(), arg); // same rounding as nearbyint, see above overload
+            return svcvt_s64_x(detail::sve_ptrue<double>(), nearest);
+        }
+
+        // ldexp: scales each lane of x by two raised to the matching integer lane of exp (svscale)
+        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& exp, requires_arch<sve>) noexcept
+        {
+            return svscale_x(detail::sve_ptrue<T>(), x, exp);
+        }
+
+    } // namespace kernel
+} // namespace xsimd
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp b/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp
new file mode 100644
index 0000000000..6d8e021a20
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp
@@ -0,0 +1,249 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_ARCH_HPP
+#define XSIMD_ARCH_HPP
+
+#include <initializer_list>
+#include <type_traits>
+#include <utility>
+
+#include "../types/xsimd_all_registers.hpp"
+#include "./xsimd_config.hpp"
+#include "./xsimd_cpuid.hpp"
+
+namespace xsimd
+{
+
+    namespace detail
+    {
+        // contains<T, Tys...>::value is true when T is one of the types in Tys.
+        template <class T, class... Tys>
+        struct contains;
+
+        // Nothing left to compare with: T was not found.
+        template <class T>
+        struct contains<T> : std::false_type
+        {
+        };
+        // T is found if it is the head of the pack or if it is contained in the tail.
+        template <class Needle, class Head, class... Tail>
+        struct contains<Needle, Head, Tail...>
+            : std::integral_constant<bool, std::is_same<Head, Needle>::value || contains<Needle, Tail...>::value>
+        {
+        };
+
+        // is_sorted<Archs...>::value tells whether Archs are ordered by non-increasing version().
+        template <class... Archs>
+        struct is_sorted;
+
+        template <>
+        struct is_sorted<> : std::true_type
+        {
+        };
+        // Lists of zero or one architecture are trivially sorted.
+        template <class Arch>
+        struct is_sorted<Arch> : std::true_type
+        {
+        };
+        // Recurse on <A1, Archs...> (not <Archs...>) so that every adjacent pair gets compared.
+        template <class A0, class A1, class... Archs>
+        struct is_sorted<A0, A1, Archs...>
+            : std::conditional<(A0::version() >= A1::version()), is_sorted<A1, Archs...>, std::false_type>::type
+        {
+        };
+
+        template <typename T>
+        inline constexpr T max_of(T single) noexcept
+        {
+            return single; // a lone value is its own maximum
+        }
+        // Two or more values: drop the smaller of the first two and recurse on what is left.
+        template <typename T, typename... Ts>
+        inline constexpr T max_of(T first, T second, Ts... rest) noexcept
+        {
+            return (first > second) ? max_of(first, rest...) : max_of(second, rest...);
+        }
+
+    } // namespace detail
+
+    // An arch_list is a compile-time list of architectures, expected to be sorted by non-increasing version number.
+    template <class... Archs>
+    struct arch_list
+    {
+#ifndef NDEBUG
+        static_assert(detail::is_sorted<Archs...>::value,
+                      "architecture list must be sorted by version");
+#endif
+        // add<Arch>: this list with Arch appended at the end.
+        template <class Arch>
+        using add = arch_list<Archs..., Arch>;
+        // extend<OtherArchs...>: this list with all of OtherArchs appended at the end.
+        template <class... OtherArchs>
+        using extend = arch_list<Archs..., OtherArchs...>;
+        // contains<Arch>(): whether Arch is one of the entries of this list.
+        template <class Arch>
+        static constexpr bool contains() noexcept
+        {
+            return detail::contains<Arch, Archs...>::value;
+        }
+        // for_each(f): calls f with a default-constructed instance of each architecture, in list order.
+        template <class F>
+        static void for_each(F&& f) noexcept
+        {
+            (void)std::initializer_list<bool> { (f(Archs {}), true)... };
+        }
+        // alignment(): the largest alignment() among the listed architectures, 0 for an empty list.
+        static constexpr std::size_t alignment() noexcept
+        {
+            // all alignments are a power of two
+            return detail::max_of(Archs::alignment()..., static_cast<size_t>(0));
+        }
+    };
+
+    struct unavailable // placeholder architecture: never supported nor available, version 0, alignment 0
+    {
+        static constexpr bool supported() noexcept { return false; }
+        static constexpr bool available() noexcept { return false; }
+        static constexpr unsigned version() noexcept { return 0; }
+        static constexpr std::size_t alignment() noexcept { return 0; }
+        static constexpr bool requires_alignment() noexcept { return false; }
+        static constexpr char const* name() noexcept { return "<none>"; } // string reported in place of an architecture name
+    };
+
+    namespace detail
+    {
+        // Pick the best architecture in arch_list L, which is the first
+        // because architectures are sorted by decreasing version.
+        template <class L>
+        struct best;
+
+        template <>
+        struct best<arch_list<>>
+        {
+            using type = unavailable; // an empty list has no best architecture
+        };
+
+        template <class Arch, class... Archs>
+        struct best<arch_list<Arch, Archs...>>
+        {
+            using type = Arch; // head of the list
+        };
+
+        // Filter the arch_list given as second argument: the archs whose supported()
+        // is true are appended to the accumulator list L, in their original order.
+        template <class L, class... Archs>
+        struct supported_helper;
+
+        template <class L>
+        struct supported_helper<L, arch_list<>>
+        {
+            using type = L; // nothing left to filter: the accumulator is the result
+        };
+        // Append Arch to L when Arch::supported() holds, then continue with the remaining archs.
+        template <class L, class Arch, class... Archs>
+        struct supported_helper<L, arch_list<Arch, Archs...>>
+            : supported_helper<
+                  typename std::conditional<Arch::supported(),
+                                            typename L::template add<Arch>, L>::type,
+                  arch_list<Archs...>>
+        {
+        };
+        // Entry point: starts the filtering with an empty accumulator.
+        template <class... Archs>
+        struct supported : supported_helper<arch_list<>, Archs...>
+        {
+        };
+
+        // Joins all arch_list Archs in a single arch_list.
+        template <class... Archs>
+        struct join;
+        // A single list is already joined.
+        template <class Arch>
+        struct join<Arch>
+        {
+            using type = Arch;
+        };
+        // Append the entries of the second list to the first one (Arch, itself a list), then continue with Args.
+        template <class Arch, class... Archs, class... Args>
+        struct join<Arch, arch_list<Archs...>, Args...>
+            : join<typename Arch::template extend<Archs...>, Args...>
+        {
+        };
+    } // namespace detail
+
+    struct unsupported
+    {
+    };
+    using all_x86_architectures = arch_list<avx512bw, avx512dq, avx512cd, avx512f, fma3<avx2>, avx2, fma3<avx>, avx, fma4, fma3<sse4_2>, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
+    using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;
+    using all_arm_architectures = typename detail::join<all_sve_architectures, arch_list<neon64, neon>>::type;
+    using all_architectures = typename detail::join<all_arm_architectures, all_x86_architectures>::type; // ARM entries first, then x86
+
+    using supported_architectures = typename detail::supported<all_architectures>::type; // entries whose supported() is true
+
+    using x86_arch = typename detail::best<typename detail::supported<all_x86_architectures>::type>::type; // unavailable if none is supported
+    using arm_arch = typename detail::best<typename detail::supported<all_arm_architectures>::type>::type; // unavailable if none is supported
+    // using default_arch = typename detail::best<typename detail::supported<arch_list</*arm_arch,*/ x86_arch>>::type>::type;
+    using default_arch = typename std::conditional<std::is_same<x86_arch, unavailable>::value, // no x86 arch supported?
+                                                   arm_arch, // then fall back to the ARM pick
+                                                   x86_arch>::type;
+
+    namespace detail
+    {
+        template <class F, class ArchList>
+        class dispatcher
+        {
+            // version number of the best architecture, read from available_architectures() at construction
+            const unsigned best_arch;
+            F functor;
+            // Only one candidate left: functor is called with it without any version check.
+            template <class Arch, class... Tys>
+            auto walk_archs(arch_list<Arch>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
+            {
+                assert(Arch::available() && "At least one arch must be supported during dispatch");
+                return functor(Arch {}, std::forward<Tys>(args)...);
+            }
+            // Calls functor with the first architecture of the list whose version does not exceed best_arch.
+            template <class Arch, class ArchNext, class... Archs, class... Tys>
+            auto walk_archs(arch_list<Arch, ArchNext, Archs...>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
+            {
+                if (Arch::version() <= best_arch) // NOTE(review): version comparison only; confirm it implies Arch runs on this CPU
+                    return functor(Arch {}, std::forward<Tys>(args)...);
+                else
+                    return walk_archs(arch_list<ArchNext, Archs...> {}, std::forward<Tys>(args)...);
+            }
+
+        public:
+            dispatcher(F f) noexcept
+                : best_arch(available_architectures().best)
+                , functor(f)
+            {
+            }
+            // Forwards args to functor, prefixed by an instance of the architecture chosen by walking ArchList from its head.
+            template <class... Tys>
+            auto operator()(Tys&&... args) noexcept -> decltype(functor(default_arch {}, std::forward<Tys>(args)...))
+            {
+                return walk_archs(ArchList {}, std::forward<Tys>(args)...);
+            }
+        };
+    }
+
+    // Generic function dispatch, à la ifunc: wraps f in a dispatcher that picks an architecture from ArchList on each call
+    template <class ArchList = supported_architectures, class F>
+    inline detail::dispatcher<F, ArchList> dispatch(F&& f) noexcept
+    {
+        return { std::forward<F>(f) };
+    }
+
+} // namespace xsimd
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/config/xsimd_config.hpp b/third_party/xsimd/include/xsimd/config/xsimd_config.hpp
new file mode 100644
index 0000000000..b4857d8d12
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/config/xsimd_config.hpp
@@ -0,0 +1,350 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_CONFIG_HPP
+#define XSIMD_CONFIG_HPP
+
+#define XSIMD_VERSION_MAJOR 10
+#define XSIMD_VERSION_MINOR 0
+#define XSIMD_VERSION_PATCH 0
+
+/**
+ * compile-time instruction set detection macros
+ *
+ * @defgroup xsimd_config_macro Instruction Set Detection
+ */
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if SSE2 is available at compile-time, to 0 otherwise.
+ */
+#ifdef __SSE2__
+#define XSIMD_WITH_SSE2 1
+#else
+#define XSIMD_WITH_SSE2 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if SSE3 is available at compile-time, to 0 otherwise.
+ */
+#ifdef __SSE3__
+#define XSIMD_WITH_SSE3 1
+#else
+#define XSIMD_WITH_SSE3 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if SSSE3 is available at compile-time, to 0 otherwise.
+ */
+#ifdef __SSSE3__
+#define XSIMD_WITH_SSSE3 1
+#else
+#define XSIMD_WITH_SSSE3 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if SSE4.1 is available at compile-time, to 0 otherwise.
+ */
+#ifdef __SSE4_1__
+#define XSIMD_WITH_SSE4_1 1
+#else
+#define XSIMD_WITH_SSE4_1 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if SSE4.2 is available at compile-time, to 0 otherwise.
+ */
+#ifdef __SSE4_2__
+#define XSIMD_WITH_SSE4_2 1
+#else
+#define XSIMD_WITH_SSE4_2 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if AVX is available at compile-time, to 0 otherwise.
+ */
+#ifdef __AVX__
+#define XSIMD_WITH_AVX 1
+#else
+#define XSIMD_WITH_AVX 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if AVX2 is available at compile-time, to 0 otherwise.
+ */
+#ifdef __AVX2__
+#define XSIMD_WITH_AVX2 1
+#else
+#define XSIMD_WITH_AVX2 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if FMA3 for SSE is available at compile-time, to 0 otherwise.
+ */
+#ifdef __FMA__
+
+#if defined(__SSE__)
+#ifndef XSIMD_WITH_FMA3_SSE // Leave the opportunity to manually disable it, see #643
+#define XSIMD_WITH_FMA3_SSE 1
+#endif
+#else
+
+#if XSIMD_WITH_FMA3_SSE
+#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
+#endif
+
+#define XSIMD_WITH_FMA3_SSE 0
+#endif
+
+#else
+
+#if XSIMD_WITH_FMA3_SSE
+#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
+#endif
+
+#define XSIMD_WITH_FMA3_SSE 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if FMA3 for AVX is available at compile-time, to 0 otherwise.
+ */
+#ifdef __FMA__
+
+#if defined(__AVX__)
+#ifndef XSIMD_WITH_FMA3_AVX // Leave the opportunity to manually disable it, see #643
+#define XSIMD_WITH_FMA3_AVX 1
+#endif
+#else
+
+#if XSIMD_WITH_FMA3_AVX
+#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
+#endif
+
+#define XSIMD_WITH_FMA3_AVX 0
+#endif
+
+#if defined(__AVX2__)
+#ifndef XSIMD_WITH_FMA3_AVX2 // Leave the opportunity to manually disable it, see #643
+#define XSIMD_WITH_FMA3_AVX2 1
+#endif
+#else
+
+#if XSIMD_WITH_FMA3_AVX2
+#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
+#endif
+
+#define XSIMD_WITH_FMA3_AVX2 0
+#endif
+
+#else
+
+#if XSIMD_WITH_FMA3_AVX
+#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
+#endif
+
+#if XSIMD_WITH_FMA3_AVX2
+#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
+#endif
+
+#define XSIMD_WITH_FMA3_AVX 0
+#define XSIMD_WITH_FMA3_AVX2 0
+
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if FMA4 is available at compile-time, to 0 otherwise.
+ */
+#ifdef __FMA4__
+#define XSIMD_WITH_FMA4 1
+#else
+#define XSIMD_WITH_FMA4 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if AVX512F is available at compile-time, to 0 otherwise.
+ */
+#ifdef __AVX512F__
+// AVX512 instructions are supported starting with gcc 6
+// see https://www.gnu.org/software/gcc/gcc-6/changes.html
+// check clang first, newer clang always defines __GNUC__ = 4
+#if defined(__clang__) && __clang_major__ >= 6
+#define XSIMD_WITH_AVX512F 1
+#elif defined(__GNUC__) && __GNUC__ < 6
+#define XSIMD_WITH_AVX512F 0
+#else
+#define XSIMD_WITH_AVX512F 1
+#if __GNUC__ == 6
+#define XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY 1
+#endif
+#endif
+#else
+#define XSIMD_WITH_AVX512F 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if AVX512CD is available at compile-time, to 0 otherwise.
+ */
+#ifdef __AVX512CD__
+// Avoids repeating the GCC workaround over and over
+#define XSIMD_WITH_AVX512CD XSIMD_WITH_AVX512F
+#else
+#define XSIMD_WITH_AVX512CD 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if AVX512DQ is available at compile-time, to 0 otherwise.
+ */
+#ifdef __AVX512DQ__
+#define XSIMD_WITH_AVX512DQ XSIMD_WITH_AVX512F
+#else
+#define XSIMD_WITH_AVX512DQ 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if AVX512BW is available at compile-time, to 0 otherwise.
+ */
+#ifdef __AVX512BW__
+#define XSIMD_WITH_AVX512BW XSIMD_WITH_AVX512F
+#else
+#define XSIMD_WITH_AVX512BW 0
+#endif
+
+#ifdef __ARM_NEON
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if NEON is available at compile-time, to 0 otherwise.
+ */
+#if __ARM_ARCH >= 7
+#define XSIMD_WITH_NEON 1
+#else
+#define XSIMD_WITH_NEON 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if NEON64 is available at compile-time, to 0 otherwise.
+ */
+#ifdef __aarch64__
+#define XSIMD_WITH_NEON64 1
+#else
+#define XSIMD_WITH_NEON64 0
+#endif
+#else
+#define XSIMD_WITH_NEON 0
+#define XSIMD_WITH_NEON64 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if SVE is available and bit width is pre-set at compile-time, to 0 otherwise.
+ */
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
+#define XSIMD_WITH_SVE 1
+#define XSIMD_SVE_BITS __ARM_FEATURE_SVE_BITS
+#else
+#define XSIMD_WITH_SVE 0
+#define XSIMD_SVE_BITS 0
+#endif
+
+// Workaround for MSVC compiler: each enabled instruction set also force-enables the sets it implies
+#ifdef _MSC_VER
+
+#if XSIMD_WITH_AVX512F // was XSIMD_WITH_AVX512, a macro this header never defines, so the branch was dead
+
+#undef XSIMD_WITH_AVX2
+#define XSIMD_WITH_AVX2 1
+
+#endif
+
+#if XSIMD_WITH_AVX2
+
+#undef XSIMD_WITH_AVX
+#define XSIMD_WITH_AVX 1
+
+#undef XSIMD_WITH_FMA3_AVX
+#define XSIMD_WITH_FMA3_AVX 1
+
+#undef XSIMD_WITH_FMA3_AVX2
+#define XSIMD_WITH_FMA3_AVX2 1
+
+#endif
+
+#if XSIMD_WITH_AVX
+
+#undef XSIMD_WITH_SSE4_2
+#define XSIMD_WITH_SSE4_2 1
+
+#endif
+
+#if XSIMD_WITH_SSE4_2
+
+#undef XSIMD_WITH_SSE4_1
+#define XSIMD_WITH_SSE4_1 1
+
+#endif
+
+#if XSIMD_WITH_SSE4_1
+
+#undef XSIMD_WITH_SSSE3
+#define XSIMD_WITH_SSSE3 1
+
+#endif
+
+#if XSIMD_WITH_SSSE3
+
+#undef XSIMD_WITH_SSE3
+#define XSIMD_WITH_SSE3 1
+
+#endif
+
+#if XSIMD_WITH_SSE3 || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+#undef XSIMD_WITH_SSE2
+#define XSIMD_WITH_SSE2 1
+#endif
+
+#endif
+
+#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE
+#define XSIMD_NO_SUPPORTED_ARCHITECTURE
+#endif
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp b/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp
new file mode 100644
index 0000000000..6003733705
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp
@@ -0,0 +1,181 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_CPUID_HPP
+#define XSIMD_CPUID_HPP
+
+#include <algorithm>
+#include <cstring>
+
+#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM))
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+#endif
+
+#if defined(_MSC_VER)
+// Contains the definition of __cpuidex
+#include <intrin.h>
+#endif
+
+#include "../types/xsimd_all_registers.hpp"
+
+namespace xsimd
+{
+    namespace detail
+    {
+        struct supported_arch // one flag per instruction set plus the best version, all filled in by the constructor
+        {
+            unsigned sse2 : 1;
+            unsigned sse3 : 1;
+            unsigned ssse3 : 1;
+            unsigned sse4_1 : 1;
+            unsigned sse4_2 : 1;
+            unsigned sse4a : 1;
+            unsigned fma3_sse : 1;
+            unsigned fma4 : 1;
+            unsigned xop : 1;
+            unsigned avx : 1;
+            unsigned fma3_avx : 1;
+            unsigned avx2 : 1;
+            unsigned fma3_avx2 : 1;
+            unsigned avx512f : 1;
+            unsigned avx512cd : 1;
+            unsigned avx512dq : 1;
+            unsigned avx512bw : 1;
+            unsigned neon : 1;
+            unsigned neon64 : 1;
+
+            // version number of the best arch available
+            unsigned best;
+
+            supported_arch() noexcept
+            {
+                std::memset(this, 0, sizeof(supported_arch));
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+                neon = 1;
+                neon64 = 1;
+                best = neon64::version();
+#elif defined(__ARM_NEON) || defined(_M_ARM)
+
+#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
+                neon = bool(getauxval(AT_HWCAP) & HWCAP_NEON);
+#else
+                // that's very conservative :-/
+                neon = 0;
+#endif
+                neon64 = 0;
+                best = neon::version() * neon;
+
+#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
+                auto get_cpuid = [](int reg[4], int func_id) noexcept
+                {
+
+#if defined(_MSC_VER)
+                    __cpuidex(reg, func_id, 0);
+
+#elif defined(__INTEL_COMPILER)
+                    __cpuid(reg, func_id);
+
+#elif defined(__GNUC__) || defined(__clang__)
+
+#if defined(__i386__) && defined(__PIC__)
+                    // %ebx may be the PIC register
+                    __asm__("xchg{l}\t{%%}ebx, %1\n\t"
+                            "cpuid\n\t"
+                            "xchg{l}\t{%%}ebx, %1\n\t"
+                            : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]),
+                              "=d"(reg[3])
+                            : "a"(func_id), "c"(0));
+
+#else
+                    __asm__("cpuid\n\t"
+                            : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]),
+                              "=d"(reg[3])
+                            : "a"(func_id), "c"(0));
+#endif
+
+#else
+#error "Unsupported configuration"
+#endif
+                };
+
+                int regs[4];
+
+                get_cpuid(regs, 0x1);
+
+                sse2 = regs[3] >> 26 & 1;
+                best = std::max(best, sse2::version() * sse2);
+
+                sse3 = regs[2] >> 0 & 1;
+                best = std::max(best, sse3::version() * sse3);
+
+                ssse3 = regs[2] >> 9 & 1;
+                best = std::max(best, ssse3::version() * ssse3);
+
+                sse4_1 = regs[2] >> 19 & 1;
+                best = std::max(best, sse4_1::version() * sse4_1);
+
+                sse4_2 = regs[2] >> 20 & 1;
+                best = std::max(best, sse4_2::version() * sse4_2);
+
+                fma3_sse = regs[2] >> 12 & 1;
+                if (sse4_2)
+                    best = std::max(best, fma3<xsimd::sse4_2>::version() * fma3_sse);
+                // AVX is bit 28 of ECX for leaf 0x1: read it while regs still holds that leaf, before the next get_cpuid
+                avx = regs[2] >> 28 & 1;
+                best = std::max(best, avx::version() * avx);
+
+                fma3_avx = avx && fma3_sse;
+                best = std::max(best, fma3<xsimd::avx>::version() * fma3_avx);
+
+                get_cpuid(regs, 0x80000001);
+                fma4 = regs[2] >> 16 & 1;
+                best = std::max(best, fma4::version() * fma4);
+
+                // sse4a = regs[2] >> 6 & 1;
+                // best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a);
+
+                // xop = regs[2] >> 11 & 1;
+                // best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop);
+
+                get_cpuid(regs, 0x7);
+                avx2 = regs[1] >> 5 & 1;
+                best = std::max(best, avx2::version() * avx2);
+
+                fma3_avx2 = avx2 && fma3_sse;
+                best = std::max(best, fma3<xsimd::avx2>::version() * fma3_avx2);
+
+                avx512f = regs[1] >> 16 & 1;
+                best = std::max(best, avx512f::version() * avx512f);
+
+                avx512cd = regs[1] >> 28 & 1;
+                best = std::max(best, avx512cd::version() * avx512cd * avx512f);
+
+                avx512dq = regs[1] >> 17 & 1;
+                best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f);
+
+                avx512bw = regs[1] >> 30 & 1;
+                best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f);
+
+#endif
+            }
+        };
+    }
+
+    inline detail::supported_arch available_architectures() noexcept
+    {
+        static detail::supported_arch supported; // function-local static: the detecting constructor runs on the first call only
+        return supported; // callers receive a copy of the cached flags
+    }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp b/third_party/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp
new file mode 100644
index 0000000000..4e65b689c5
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp
@@ -0,0 +1,719 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+
+namespace xsimd
+{
+    namespace detail
+    {
+
+        /* origin: boost/simd/arch/common/scalar/function/rem_pio2.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+#if defined(_MSC_VER) /* MSVC: warning 4127 is disabled just around the `while (0)` */
+#define ONCE0                                       \
+    __pragma(warning(push))                         \
+        __pragma(warning(disable : 4127)) while (0) \
+            __pragma(warning(pop)) /* ONCE0 is the tail of the `do { ... } ONCE0` bodies of the word macros below */
+#else /* every other compiler: a plain `while (0)` */
+#define ONCE0 while (0)
+#endif /* ONCE0 */
+
+        /*
+         * ====================================================
+         * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+         *
+         * Developed at SunPro, a Sun Microsystems, Inc. business.
+         * Permission to use, copy, modify, and distribute this
+         * software is freely granted, provided that this notice
+         * is preserved.
+         * ====================================================
+         */
+
+#if defined(__GNUC__) && defined(__BYTE_ORDER__) /* the compiler reports the byte order itself */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define XSIMD_LITTLE_ENDIAN
+#endif
+#elif defined(_WIN32)
+// We can safely assume that Windows is always little endian
+#define XSIMD_LITTLE_ENDIAN
+#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__) /* fallback: platform macros */
+#define XSIMD_LITTLE_ENDIAN
+#endif /* NOTE(review): any toolchain matching none of the branches leaves XSIMD_LITTLE_ENDIAN undefined and is treated as big endian below */
+
+#ifdef XSIMD_LITTLE_ENDIAN
+#define LOW_WORD_IDX 0 /* byte offset of the low 32-bit word inside a double */
+#define HIGH_WORD_IDX sizeof(std::uint32_t) /* byte offset of the high 32-bit word */
+#else /* XSIMD_LITTLE_ENDIAN not defined: the two offsets are swapped */
+#define LOW_WORD_IDX sizeof(std::uint32_t)
+#define HIGH_WORD_IDX 0
+#endif /* XSIMD_LITTLE_ENDIAN */
+
+#define GET_HIGH_WORD(i, d)                                            \
+    do                                                                 \
+    {                                                                  \
+        double f = (d);                                                \
+        std::memcpy(&(i), reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
+                    sizeof(std::uint32_t));                            \
+    }                                                                  \
+    ONCE0                                                              \
+    /* GET_HIGH_WORD(i, d): copies the 4 bytes at offset HIGH_WORD_IDX of the double d into i, bytewise via memcpy */
+
+#define GET_LOW_WORD(i, d)                                            \
+    do                                                                \
+    {                                                                 \
+        double f = (d);                                               \
+        std::memcpy(&(i), reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
+                    sizeof(std::uint32_t));                           \
+    }                                                                 \
+    ONCE0                                                             \
+    /* GET_LOW_WORD(i, d): copies the 4 bytes at offset LOW_WORD_IDX of the double d into i, bytewise via memcpy */
+
+#define SET_HIGH_WORD(d, v)                                      \
+    do                                                           \
+    {                                                            \
+        double f = (d);                                          \
+        std::uint32_t value = (v);                               \
+        std::memcpy(reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
+                    &value, sizeof(std::uint32_t));              \
+        (d) = f;                                                 \
+    }                                                            \
+    ONCE0                                                        \
+    /* SET_HIGH_WORD(d, v): patches the 4 bytes at HIGH_WORD_IDX of a local copy of d with v, then assigns the copy back to d */
+
+#define SET_LOW_WORD(d, v)                                      \
+    do                                                          \
+    {                                                           \
+        double f = (d);                                         \
+        std::uint32_t value = (v);                              \
+        std::memcpy(reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
+                    &value, sizeof(std::uint32_t));             \
+        (d) = f;                                                \
+    }                                                           \
+    ONCE0                                                       \
+    /* SET_LOW_WORD(d, v): patches the 4 bytes at LOW_WORD_IDX of a local copy of d with v, then assigns the copy back to d */
+
+        /*
+         * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
+         * double x[],y[]; int e0,nx,prec; int ipio2[];
+         *
+         * __kernel_rem_pio2 return the last three digits of N with
+         *		y = x - N*pi/2
+         * so that |y| < pi/2.
+         *
+         * The method is to compute the integer (mod 8) and fraction parts of
+         * (2/pi)*x without doing the full multiplication. In general we
+         * skip the parts of the product that are known to be a huge integer (
+         * more accurately, = 0 mod 8 ). Thus the number of operations is
+         * independent of the exponent of the input.
+         *
+         * (2/pi) is represented by an array of 24-bit integers in ipio2[].
+         *
+         * Input parameters:
+         * 	x[]	The input value (must be positive) is broken into nx
+         *		pieces of 24-bit integers in double precision format.
+         *		x[i] will be the i-th 24 bit of x. The scaled exponent
+         *		of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
+         *		matches x up to 24 bits).
+         *
+         *		Example of breaking a double positive z into x[0]+x[1]+x[2]:
+         *			e0 = ilogb(z)-23
+         *			z  = scalbn(z,-e0)
+         *		for i = 0,1,2
+         *			x[i] = floor(z)
+         *			z    = (z-x[i])*2**24
+         *
+         *
+         *	y[]	output result in an array of double precision numbers.
+         *		The dimension of y[] is:
+         *			24-bit  precision	1
+         *			53-bit  precision	2
+         *			64-bit  precision	2
+         *			113-bit precision	3
+         *		The actual value is the sum of them. Thus for 113-bit
+         *		precision, one may have to do something like:
+         *
+         *		long double t,w,r_head, r_tail;
+         *		t = (long double)y[2] + (long double)y[1];
+         *		w = (long double)y[0];
+         *		r_head = t+w;
+         *		r_tail = w - (r_head - t);
+         *
+         *	e0	The exponent of x[0]
+         *
+         *	nx	dimension of x[]
+         *
+         *  	prec	an integer indicating the precision:
+         *			0	24  bits (single)
+         *			1	53  bits (double)
+         *			2	64  bits (extended)
+         *			3	113 bits (quad)
+         *
+         *	ipio2[]
+         *		integer array, contains the (24*i)-th to (24*i+23)-th
+         *		bit of 2/pi after binary point. The corresponding
+         *		floating value is
+         *
+         *			ipio2[i] * 2^(-24(i+1)).
+         *
+         * External function:
+         *	double scalbn(), floor();
+         *
+         *
+         * Here is the description of some local variables:
+         *
+         * 	jk	jk+1 is the initial number of terms of ipio2[] needed
+         *		in the computation. The recommended value is 2,3,4,
+         *		6 for single, double, extended,and quad.
+         *
+         * 	jz	local integer variable indicating the number of
+         *		terms of ipio2[] used.
+         *
+         *	jx	nx - 1
+         *
+         *	jv	index for pointing to the suitable ipio2[] for the
+         *		computation. In general, we want
+         *			( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
+         *		is an integer. Thus
+         *			e0-3-24*jv >= 0 or (e0-3)/24 >= jv
+         *		Hence jv = max(0,(e0-3)/24).
+         *
+         *	jp	jp+1 is the number of terms in PIo2[] needed, jp = jk.
+         *
+         * 	q[]	double array with integral value, representing the
+         *		24-bits chunk of the product of x and 2/pi.
+         *
+         *	q0	the corresponding exponent of q[0]. Note that the
+         *		exponent for q[i] would be q0-24*i.
+         *
+         *	PIo2[]	double precision array, obtained by cutting pi/2
+         *		into 24 bits chunks.
+         *
+         *	f[]	ipio2[] in floating point
+         *
+         *	iq[]	integer array by breaking up q[] in 24-bits chunk.
+         *
+         *	fq[]	final product of x*(2/pi) in fq[0],..,fq[jk]
+         *
+         *	ih	integer. If >0 it indicates q[] is >= 0.5, hence
+         *		it also indicates the *sign* of the result.
+         *
+         */
+
+        inline int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept
+        { /* reduces the positive value held in x[0..nx-1] (scaled by 2^e0) modulo pi/2: remainder goes to y[], n & 7 is returned */
+            static const int32_t init_jk[] = { 2, 3, 4, 6 }; /* initial value for jk */
+
+            static const double PIo2[] = {
+                1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
+                7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
+                5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
+                3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
+                1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
+                1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
+                2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
+                2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
+            };
+
+            static const double
+                zero
+                = 0.0,
+                one = 1.0,
+                two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
+                twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
+
+            int32_t jz, jx, jv, jp, jk, carry, n, iq[20], i, j, k, m, q0, ih;
+            double z, fw, f[20], fq[20], q[20]; /* fixed 20-entry work arrays; indices are not range-checked */
+
+            /* initialize jk*/
+            jk = init_jk[prec]; /* prec must be 0..3: it indexes init_jk unchecked */
+            jp = jk;
+
+            /* determine jx,jv,q0, note that 3>q0 */
+            jx = nx - 1;
+            jv = (e0 - 3) / 24;
+            if (jv < 0)
+                jv = 0;
+            q0 = e0 - 24 * (jv + 1);
+
+            /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
+            j = jv - jx;
+            m = jx + jk;
+            for (i = 0; i <= m; i++, j++)
+                f[i] = (j < 0) ? zero : (double)ipio2[j]; /* negative table indices contribute zero */
+
+            /* compute q[0],q[1],...q[jk] */
+            for (i = 0; i <= jk; i++)
+            {
+                for (j = 0, fw = 0.0; j <= jx; j++)
+                    fw += x[j] * f[jx + i - j];
+                q[i] = fw;
+            }
+
+            jz = jk;
+
+        recompute: /* re-entered through the goto below once jz has been extended by k terms */
+            /* distill q[] into iq[] reversingly */
+            for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--)
+            {
+                fw = (double)((int32_t)(twon24 * z));
+                iq[i] = (int32_t)(z - two24 * fw); /* same int32_t cast as every other iq[] store in this function */
+                z = q[j - 1] + fw;
+            }
+
+            /* compute n */
+            z = std::scalbn(z, q0); /* actual value of z */
+            z -= 8.0 * std::floor(z * 0.125); /* trim off integer >= 8 */
+            n = (int32_t)z;
+            z -= (double)n;
+            ih = 0;
+            if (q0 > 0)
+            { /* need iq[jz-1] to determine n */
+                i = (iq[jz - 1] >> (24 - q0));
+                n += i;
+                iq[jz - 1] -= i << (24 - q0);
+                ih = iq[jz - 1] >> (23 - q0);
+            }
+            else if (q0 == 0)
+                ih = iq[jz - 1] >> 23;
+            else if (z >= 0.5)
+                ih = 2;
+
+            if (ih > 0)
+            { /* q > 0.5 */
+                n += 1;
+                carry = 0;
+                for (i = 0; i < jz; i++)
+                { /* compute 1-q */
+                    j = iq[i];
+                    if (carry == 0)
+                    {
+                        if (j != 0)
+                        {
+                            carry = 1;
+                            iq[i] = 0x1000000 - j;
+                        }
+                    }
+                    else
+                        iq[i] = 0xffffff - j;
+                }
+                if (q0 > 0)
+                { /* rare case: chance is 1 in 12 */
+                    switch (q0)
+                    {
+                    case 1:
+                        iq[jz - 1] &= 0x7fffff;
+                        break;
+                    case 2:
+                        iq[jz - 1] &= 0x3fffff;
+                        break;
+                    }
+                }
+                if (ih == 2)
+                {
+                    z = one - z;
+                    if (carry != 0)
+                        z -= std::scalbn(one, q0);
+                }
+            }
+
+            /* check if recomputation is needed */
+            if (z == zero)
+            {
+                j = 0;
+                for (i = jz - 1; i >= jk; i--)
+                    j |= iq[i];
+                if (j == 0)
+                { /* need recomputation */
+                    for (k = 1; iq[jk - k] == 0; k++)
+                        ; /* k = no. of terms needed */
+
+                    for (i = jz + 1; i <= jz + k; i++)
+                    { /* add q[jz+1] to q[jz+k] */
+                        f[jx + i] = (double)ipio2[jv + i];
+                        for (j = 0, fw = 0.0; j <= jx; j++)
+                            fw += x[j] * f[jx + i - j];
+                        q[i] = fw;
+                    }
+                    jz += k;
+                    goto recompute;
+                }
+            }
+
+            /* chop off zero terms */
+            if (z == 0.0)
+            {
+                jz -= 1;
+                q0 -= 24;
+                while (iq[jz] == 0)
+                {
+                    jz--;
+                    q0 -= 24;
+                }
+            }
+            else
+            { /* break z into 24-bit if necessary */
+                z = std::scalbn(z, -q0);
+                if (z >= two24)
+                {
+                    fw = (double)((int32_t)(twon24 * z));
+                    iq[jz] = (int32_t)(z - two24 * fw);
+                    jz += 1;
+                    q0 += 24;
+                    iq[jz] = (int32_t)fw;
+                }
+                else
+                    iq[jz] = (int32_t)z;
+            }
+
+            /* convert integer "bit" chunk to floating-point value */
+            fw = std::scalbn(one, q0); /* std::-qualified like the other calls: <cmath> only guarantees the std:: name */
+            for (i = jz; i >= 0; i--)
+            {
+                q[i] = fw * (double)iq[i];
+                fw *= twon24;
+            }
+
+            /* compute PIo2[0,...,jp]*q[jz,...,0] */
+            for (i = jz; i >= 0; i--)
+            {
+                for (fw = 0.0, k = 0; k <= jp && k <= jz - i; k++)
+                    fw += PIo2[k] * q[i + k];
+                fq[jz - i] = fw;
+            }
+
+            /* compress fq[] into y[] */
+            switch (prec)
+            {
+            case 0: /* one output word: y[0] */
+                fw = 0.0;
+                for (i = jz; i >= 0; i--)
+                    fw += fq[i];
+                y[0] = (ih == 0) ? fw : -fw;
+                break;
+            case 1:
+            case 2: /* two output words: y[0] is the sum, y[1] the part of the sum lost when rounding y[0] */
+                fw = 0.0;
+                for (i = jz; i >= 0; i--)
+                    fw += fq[i];
+                y[0] = (ih == 0) ? fw : -fw;
+                fw = fq[0] - fw;
+                for (i = 1; i <= jz; i++)
+                    fw += fq[i];
+                y[1] = (ih == 0) ? fw : -fw;
+                break;
+            case 3: /* painful */
+                for (i = jz; i > 0; i--)
+                {
+                    fw = fq[i - 1] + fq[i];
+                    fq[i] += fq[i - 1] - fw;
+                    fq[i - 1] = fw;
+                }
+                for (i = jz; i > 1; i--)
+                {
+                    fw = fq[i - 1] + fq[i];
+                    fq[i] += fq[i - 1] - fw;
+                    fq[i - 1] = fw;
+                }
+                for (fw = 0.0, i = jz; i >= 2; i--)
+                    fw += fq[i];
+                if (ih == 0)
+                {
+                    y[0] = fq[0];
+                    y[1] = fq[1];
+                    y[2] = fw;
+                }
+                else
+                {
+                    y[0] = -fq[0];
+                    y[1] = -fq[1];
+                    y[2] = -fw;
+                }
+            }
+            return n & 7; /* only the low three bits of n are reported to the caller */
+        }
+
+        inline std::int32_t __ieee754_rem_pio2(double x, double* y) noexcept
+        { /* writes the reduced argument to y[0] (head) and y[1] (tail) and returns the multiple n of pi/2 that was removed */
+            static const std::int32_t two_over_pi[] = { /* passed to __kernel_rem_pio2 as its ipio2 table (large-argument path) */
+                0xA2F983,
+                0x6E4E44,
+                0x1529FC,
+                0x2757D1,
+                0xF534DD,
+                0xC0DB62,
+                0x95993C,
+                0x439041,
+                0xFE5163,
+                0xABDEBB,
+                0xC561B7,
+                0x246E3A,
+                0x424DD2,
+                0xE00649,
+                0x2EEA09,
+                0xD1921C,
+                0xFE1DEB,
+                0x1CB129,
+                0xA73EE8,
+                0x8235F5,
+                0x2EBB44,
+                0x84E99C,
+                0x7026B4,
+                0x5F7E41,
+                0x3991D6,
+                0x398353,
+                0x39F49C,
+                0x845F8B,
+                0xBDF928,
+                0x3B1FF8,
+                0x97FFDE,
+                0x05980F,
+                0xEF2F11,
+                0x8B5A0A,
+                0x6D1F6D,
+                0x367ECF,
+                0x27CB09,
+                0xB74F46,
+                0x3F669E,
+                0x5FEA2D,
+                0x7527BA,
+                0xC7EBE5,
+                0xF17B3D,
+                0x0739F7,
+                0x8A5292,
+                0xEA6BFB,
+                0x5FB11F,
+                0x8D5D08,
+                0x560330,
+                0x46FC7B,
+                0x6BABF0,
+                0xCFBC20,
+                0x9AF436,
+                0x1DA9E3,
+                0x91615E,
+                0xE61B08,
+                0x659985,
+                0x5F14A0,
+                0x68408D,
+                0xFFD880,
+                0x4D7327,
+                0x310606,
+                0x1556CA,
+                0x73A8C9,
+                0x60E27B,
+                0xC08C6B,
+            };
+
+            static const std::int32_t npio2_hw[] = { /* compared with ix (high word of |x|) for n = 1..31 in the medium-size branch */
+                0x3FF921FB,
+                0x400921FB,
+                0x4012D97C,
+                0x401921FB,
+                0x401F6A7A,
+                0x4022D97C,
+                0x4025FDBB,
+                0x402921FB,
+                0x402C463A,
+                0x402F6A7A,
+                0x4031475C,
+                0x4032D97C,
+                0x40346B9C,
+                0x4035FDBB,
+                0x40378FDB,
+                0x403921FB,
+                0x403AB41B,
+                0x403C463A,
+                0x403DD85A,
+                0x403F6A7A,
+                0x40407E4C,
+                0x4041475C,
+                0x4042106C,
+                0x4042D97C,
+                0x4043A28C,
+                0x40446B9C,
+                0x404534AC,
+                0x4045FDBB,
+                0x4046C6CB,
+                0x40478FDB,
+                0x404858EB,
+                0x404921FB,
+            };
+
+            /*
+             * invpio2:  53 bits of 2/pi
+             * pio2_1:   first  33 bit of pi/2
+             * pio2_1t:  pi/2 - pio2_1
+             * pio2_2:   second 33 bit of pi/2
+             * pio2_2t:  pi/2 - (pio2_1+pio2_2)
+             * pio2_3:   third  33 bit of pi/2
+             * pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
+             */
+
+            static const double
+                zero
+                = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
+                half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
+                two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
+                invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
+                pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
+                pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
+                pio2_2 = 6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
+                pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
+                pio2_3 = 2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
+                pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
+
+            double z = 0., w, t, r, fn;
+            double tx[3]; /* up to three chunks of |x| handed to __kernel_rem_pio2 */
+            std::int32_t e0, i, j, nx, n, ix, hx;
+            std::uint32_t low;
+
+            GET_HIGH_WORD(hx, x); /* high word of x */
+            ix = hx & 0x7fffffff; /* same word with the top (sign) bit masked off */
+            if (ix <= 0x3fe921fb) /* |x| ~<= pi/4 , no need for reduction */
+            {
+                y[0] = x;
+                y[1] = 0;
+                return 0;
+            }
+            if (ix < 0x4002d97c)
+            { /* |x| < 3pi/4, special case with n=+-1 */
+                if (hx > 0)
+                {
+                    z = x - pio2_1;
+                    if (ix != 0x3ff921fb)
+                    { /* 33+53 bit pi is good enough */
+                        y[0] = z - pio2_1t;
+                        y[1] = (z - y[0]) - pio2_1t;
+                    }
+                    else
+                    { /* near pi/2, use 33+33+53 bit pi */
+                        z -= pio2_2;
+                        y[0] = z - pio2_2t;
+                        y[1] = (z - y[0]) - pio2_2t;
+                    }
+                    return 1;
+                }
+                else
+                { /* negative x */
+                    z = x + pio2_1;
+                    if (ix != 0x3ff921fb)
+                    { /* 33+53 bit pi is good enough */
+                        y[0] = z + pio2_1t;
+                        y[1] = (z - y[0]) + pio2_1t;
+                    }
+                    else
+                    { /* near pi/2, use 33+33+53 bit pi */
+                        z += pio2_2;
+                        y[0] = z + pio2_2t;
+                        y[1] = (z - y[0]) + pio2_2t;
+                    }
+
+                    return -1;
+                }
+            }
+            if (ix <= 0x413921fb)
+            { /* |x| ~<= 2^19*(pi/2), medium size */
+                t = std::fabs(x);
+                n = (std::int32_t)(t * invpio2 + half); /* nearest multiple of pi/2 (adds 0.5, then truncates) */
+                fn = (double)n;
+                r = t - fn * pio2_1;
+                w = fn * pio2_1t; /* 1st round good to 85 bit */
+                if ((n < 32) && (n > 0) && (ix != npio2_hw[n - 1]))
+                {
+                    y[0] = r - w; /* quick check no cancellation */
+                }
+                else
+                {
+                    std::uint32_t high;
+                    j = ix >> 20; /* exponent field of |x| */
+                    y[0] = r - w;
+                    GET_HIGH_WORD(high, y[0]);
+                    i = j - static_cast<int32_t>((high >> 20) & 0x7ff); /* exponent drop from x to y[0] */
+                    if (i > 16)
+                    { /* 2nd iteration needed, good to 118 */
+                        t = r;
+                        w = fn * pio2_2;
+                        r = t - w;
+                        w = fn * pio2_2t - ((t - r) - w);
+                        y[0] = r - w;
+                        GET_HIGH_WORD(high, y[0]);
+                        i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
+                        if (i > 49)
+                        { /* 3rd iteration need, 151 bits acc */
+                            t = r; /* will cover all possible cases */
+                            w = fn * pio2_3;
+                            r = t - w;
+                            w = fn * pio2_3t - ((t - r) - w);
+                            y[0] = r - w;
+                        }
+                    }
+                }
+                y[1] = (r - y[0]) - w; /* tail: what was lost when r - w was rounded into y[0] */
+                if (hx < 0)
+                { /* negative x: mirror the result computed for |x| */
+                    y[0] = -y[0];
+                    y[1] = -y[1];
+                    return -n;
+                }
+                else
+                    return n;
+            }
+            /*
+             * all other (large) arguments
+             */
+            if (ix >= 0x7ff00000)
+            { /* x is inf or NaN */
+                y[0] = y[1] = x - x;
+                return 0;
+            }
+            /* set z = scalbn(|x|,ilogb(x)-23) */
+            GET_LOW_WORD(low, x);
+            SET_LOW_WORD(z, low);
+            e0 = (ix >> 20) - 1046; /* e0 = ilogb(z)-23; */
+            SET_HIGH_WORD(z, static_cast<uint32_t>(ix - (e0 << 20)));
+            for (i = 0; i < 2; i++)
+            { /* peel off two integer chunks, rescaling the fraction by 2^24 each time */
+                tx[i] = (double)((std::int32_t)(z));
+                z = (z - tx[i]) * two24;
+            }
+            tx[2] = z;
+            nx = 3;
+            while (tx[nx - 1] == zero)
+                nx--; /* skip zero term */
+            n = __kernel_rem_pio2(tx, y, e0, nx, 2, two_over_pi); /* prec argument 2 selects init_jk[2] in the kernel */
+            if (hx < 0)
+            { /* negative x: mirror the result computed for |x| */
+                y[0] = -y[0];
+                y[1] = -y[1];
+                return -n;
+            }
+            return n;
+        }
+    }
+
+#undef XSIMD_LITTLE_ENDIAN
+#undef SET_LOW_WORD
+#undef SET_HIGH_WORD
+#undef GET_LOW_WORD
+#undef GET_HIGH_WORD
+#undef HIGH_WORD_IDX
+#undef LOW_WORD_IDX
+#undef ONCE0
+}
diff --git a/third_party/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp b/third_party/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp
new file mode 100644
index 0000000000..c9021a0c9f
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/memory/xsimd_aligned_allocator.hpp
@@ -0,0 +1,349 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_ALIGNED_ALLOCATOR_HPP
+#define XSIMD_ALIGNED_ALLOCATOR_HPP
+
+#include <algorithm>
+#include <cstddef>
+#include <utility>
+#ifdef _WIN32
+#include <malloc.h>
+#else
+#include <cstdlib>
+#endif
+
+#include <cassert>
+#include <memory>
+
+#include "../config/xsimd_arch.hpp"
+
+namespace xsimd
+{
+
+    /**
+     * @class aligned_allocator
+     * @brief Allocator for aligned memory
+     *
+     * The aligned_allocator class template is an allocator that
+     * performs memory allocation aligned by the specified value.
+     *
+     * @tparam T type of objects to allocate.
+     * @tparam Align alignment in bytes.
+     */
+    template <class T, size_t Align = default_arch::alignment()>
+    class aligned_allocator
+    { // declares no non-static data members: only typedefs, one static constant and member functions
+    public:
+        using value_type = T;
+        using pointer = T*;
+        using const_pointer = const T*;
+        using reference = T&;
+        using const_reference = const T&;
+        using size_type = size_t;
+        using difference_type = ptrdiff_t;
+
+        static constexpr size_t alignment = Align; // the Align template argument, exposed as a constant
+
+        template <class U>
+        struct rebind
+        {
+            using other = aligned_allocator<U, Align>; // same alignment, different value type
+        };
+
+        aligned_allocator() noexcept;
+        aligned_allocator(const aligned_allocator& rhs) noexcept;
+
+        template <class U>
+        aligned_allocator(const aligned_allocator<U, Align>& rhs) noexcept; // copy from an allocator of another value type
+
+        ~aligned_allocator();
+
+        pointer address(reference) noexcept;
+        const_pointer address(const_reference) const noexcept;
+
+        pointer allocate(size_type n, const void* hint = 0); // hint is ignored by the definition in this file
+        void deallocate(pointer p, size_type n); // n is ignored by the definition in this file
+
+        size_type max_size() const noexcept;
+        size_type size_max() const noexcept; // documented below as deprecated in favour of max_size()
+
+        template <class U, class... Args>
+        void construct(U* p, Args&&... args);
+
+        template <class U>
+        void destroy(U* p);
+    };
+
+    template <class T1, size_t Align1, class T2, size_t Align2>
+    bool operator==(const aligned_allocator<T1, Align1>& lhs,
+                    const aligned_allocator<T2, Align2>& rhs) noexcept; // declaration only; value types and alignments may differ
+
+    template <class T1, size_t Align1, class T2, size_t Align2>
+    bool operator!=(const aligned_allocator<T1, Align1>& lhs,
+                    const aligned_allocator<T2, Align2>& rhs) noexcept; // declaration only; value types and alignments may differ
+
+    void* aligned_malloc(size_t size, size_t alignment); // called by allocate() with (sizeof(T) * n, A)
+    void aligned_free(void* ptr); // called by deallocate() with the pointer being released
+
+    template <class T>
+    size_t get_alignment_offset(const T* p, size_t size, size_t block_size); // declaration only here
+
+    /************************************
+     * aligned_allocator implementation *
+     ************************************/
+
+    /**
+     * Default constructor; the class declares no data members, so nothing is set up.
+     */
+    template <class T, size_t A>
+    inline aligned_allocator<T, A>::aligned_allocator() noexcept
+        // Defaulted at its out-of-class definition, so it is still a user-provided constructor.
+        = default;
+
+    /**
+     * Copy constructor; there is no state to copy from the source allocator.
+     */
+    template <class T, size_t A>
+    inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator&) noexcept
+        // Defaulted at its out-of-class definition, so it is still a user-provided constructor.
+        = default;
+
+    /**
+     * Converting copy constructor from an allocator of another value type, same alignment.
+     */
+    template <class T, size_t A>
+    template <class U>
+    inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator<U, A>& /*rhs*/) noexcept
+    { // intentionally empty: the source allocator carries nothing to take over
+    }
+
+    /**
+     * Destructor; the allocator itself owns no resource that would need releasing.
+     */
+    template <class T, size_t A>
+    inline aligned_allocator<T, A>::~aligned_allocator()
+        // Defaulted at its out-of-class definition, so it is still a user-provided destructor.
+        = default;
+
+    /**
+     * Returns the actual address of \c r even in presence of overloaded \c operator&.
+     * @param r the object to acquire address of.
+     * @return the actual address of \c r, obtained through \c std::addressof.
+     */
+    template <class T, size_t A>
+    inline auto
+    aligned_allocator<T, A>::address(reference r) noexcept -> pointer
+    {
+        return std::addressof(r); // a plain &r would call a user-defined operator& and break the documented contract
+    }
+
+    /**
+     * Returns the actual address of \c r even in presence of overloaded \c operator&.
+     * @param r the (const) object to acquire address of.
+     * @return the actual address of \c r, obtained through \c std::addressof.
+     */
+    template <class T, size_t A>
+    inline auto
+    aligned_allocator<T, A>::address(const_reference r) const noexcept -> const_pointer
+    {
+        return std::addressof(r); // a plain &r would call a user-defined operator& and break the documented contract
+    }
+
+    /**
+     * Allocates <tt>n * sizeof(T)</tt> bytes of uninitialized memory, aligned by \c A
+     * (the alignment may require some extra memory allocation).
+     * @param n the number of objects to allocate storage for; values above max_size() are rejected.
+     * @param hint unused parameter provided for standard compliance.
+     * @return a pointer to a block suitably aligned and sufficient to hold \c n objects of type \c T.
+     * @throws std::bad_alloc on failure when exceptions are enabled; otherwise \c nullptr is returned.
+     */
+    template <class T, size_t A>
+    inline auto
+    aligned_allocator<T, A>::allocate(size_type n, const void*) -> pointer
+    {
+        pointer res = (n > max_size()) ? nullptr : static_cast<pointer>(aligned_malloc(sizeof(T) * n, A)); // sizeof(T) * n must not wrap around
+#if defined(_CPPUNWIND) || defined(__cpp_exceptions)
+        if (res == nullptr)
+            throw std::bad_alloc();
+#endif
+        return res; // may be nullptr only when exceptions are disabled
+    }
+
+    /**
+     * Deallocates the storage referenced by the pointer p, which must be a pointer obtained by
+     * an earlier call to allocate(). The argument \c n must be equal to the first argument of the call
+     * to allocate() that originally produced \c p; otherwise, the behavior is undefined.
+     * @param p pointer obtained from allocate(); it is released through aligned_free().
+     * @param n number of objects earlier passed to allocate() (not used by this implementation).
+     */
+    template <class T, size_t A>
+    inline void aligned_allocator<T, A>::deallocate(pointer p, size_type)
+    {
+        aligned_free(p);
+    }
+
+    /**
+     * Returns the largest \c n for which <tt>n * sizeof(T)</tt> still fits in \c size_type,
+     * that is the theoretical upper bound for which allocate(n, 0) could succeed.
+     * @return the maximum supported allocated size.
+     */
+    template <class T, size_t A>
+    inline auto
+    aligned_allocator<T, A>::max_size() const noexcept -> size_type
+    {
+        return static_cast<size_type>(-1) / sizeof(T);
+    }
+
+    /**
+     * Deprecated alias of max_size(), kept for compatibility; use max_size() instead.
+     */
+    template <class T, size_t A>
+    inline auto
+    aligned_allocator<T, A>::size_max() const noexcept -> size_type
+    {
+        return max_size();
+    }
+
+    /**
+     * Builds an object of type \c U in the uninitialized storage pointed to by \c p,
+     * using placement-new and perfectly forwarding the constructor arguments.
+     * @param p pointer to allocated uninitialized memory.
+     * @param args the constructor arguments to use.
+     */
+    template <class T, size_t A>
+    template <class U, class... Args>
+    inline void aligned_allocator<T, A>::construct(U* p, Args&&... args)
+    {
+        new (static_cast<void*>(p)) U(std::forward<Args>(args)...);
+    }
+
+    /**
+     * Calls the destructor of the object pointed to by \c p; the storage itself is not released.
+     * @param p pointer to the object that is going to be destroyed.
+     */
+    template <class T, size_t A>
+    template <class U>
+    inline void aligned_allocator<T, A>::destroy(U* p)
+    {
+        p->~U();
+    }
+
+    /**
+     * @defgroup allocator_comparison Comparison operators
+     */
+
+    /**
+     * @ingroup allocator_comparison
+     * Compares two aligned memory allocators for equality. Since allocators
+     * are stateless, only their \c alignment members are compared; value types are ignored.
+     * @param lhs aligned_allocator to compare.
+     * @param rhs aligned_allocator to compare.
+     * @return true if the allocators have the same alignment.
+     */
+    template <class T1, size_t A1, class T2, size_t A2>
+    inline bool operator==(const aligned_allocator<T1, A1>& lhs,
+                           const aligned_allocator<T2, A2>& rhs) noexcept
+    {
+        return lhs.alignment == rhs.alignment;
+    }
+
+    /**
+     * @ingroup allocator_comparison
+     * Compares two aligned memory allocators for inequality. Defined as the
+     * negation of \c operator== for the same pair of allocators.
+     * @param lhs aligned_allocator to compare.
+     * @param rhs aligned_allocator to compare.
+     * @return true if the allocators have different alignments.
+     */
+    template <class T1, size_t A1, class T2, size_t A2>
+    inline bool operator!=(const aligned_allocator<T1, A1>& lhs,
+                           const aligned_allocator<T2, A2>& rhs) noexcept
+    {
+        return !(lhs == rhs);
+    }
+
+    /****************************************
+     * aligned malloc / free implementation *
+     ****************************************/
+
+    namespace detail
+    {
+        // Platform backend of aligned_malloc(): _aligned_malloc on Windows, posix_memalign
+        // elsewhere. A failed posix_memalign call is reported as a null pointer.
+        inline void* xaligned_malloc(size_t size, size_t alignment)
+        {
+            assert(((alignment & (alignment - 1)) == 0) && "alignment must be a power of two");
+            assert((alignment >= sizeof(void*)) && "alignment must be at least the size of a pointer");
+#ifdef _WIN32
+            return _aligned_malloc(size, alignment);
+#else
+            void* block = nullptr;
+            const int status = posix_memalign(&block, alignment, size);
+            return status == 0 ? block : nullptr;
+#endif
+        }
+
+        // Releases a block with the routine matching the one used by xaligned_malloc().
+        inline void xaligned_free(void* ptr)
+        {
+#ifdef _WIN32
+            _aligned_free(ptr);
+#else
+            free(ptr);
+#endif
+        }
+    }
+
+    inline void* aligned_malloc(size_t size, size_t alignment)
+    {
+        return detail::xaligned_malloc(size, alignment); // public entry point, forwards to the platform backend
+    }
+
+    inline void aligned_free(void* ptr)
+    {
+        detail::xaligned_free(ptr); // counterpart of aligned_malloc(), forwards to the platform backend
+    }
+
+    template <class T>
+    inline size_t get_alignment_offset(const T* p, size_t size, size_t block_size)
+    {
+        // Number of leading elements of the array (p, size) to skip before reaching an
+        // element whose index, address / sizeof(T), is a multiple of block_size. The result
+        // never exceeds size. The mask arithmetic presumes block_size is a power of two.
+        if (block_size == 1)
+        {
+            // The simd_block consists of exactly one scalar, so every element of the
+            // array is "well" aligned.
+            return 0;
+        }
+
+        const size_t address = size_t(p);
+        if (address & (sizeof(T) - 1))
+        {
+            // The array is not aligned to the size of a single element, so no element
+            // of the array is well aligned.
+            return size;
+        }
+
+        const size_t mask = block_size - 1;
+        const size_t misalignment = (address / sizeof(T)) & mask;
+        const size_t to_skip = (block_size - misalignment) & mask;
+        return std::min<size_t>(to_skip, size);
+    }
+
+    template <class T, class A = default_arch>
+    using default_allocator = typename std::conditional<A::requires_alignment(),
+                                                        aligned_allocator<T, A::alignment()>,
+                                                        std::allocator<T>>::type; // std::allocator when A requires no alignment
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/memory/xsimd_alignment.hpp b/third_party/xsimd/include/xsimd/memory/xsimd_alignment.hpp
new file mode 100644
index 0000000000..62bc068db1
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/memory/xsimd_alignment.hpp
@@ -0,0 +1,91 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_ALIGNMENT_HPP
+#define XSIMD_ALIGNMENT_HPP
+
+#include "../types/xsimd_utils.hpp"
+#include "xsimd_aligned_allocator.hpp"
+
+namespace xsimd
+{
+    /**
+     * @struct aligned_mode
+     * @brief empty tag type selecting load and store of aligned memory.
+     */
+    struct aligned_mode
+    {
+    };
+
+    /**
+     * @struct unaligned_mode
+     * @brief empty tag type selecting load and store of unaligned memory.
+     */
+    struct unaligned_mode
+    {
+    };
+
+    /***********************
+     * Allocator alignment *
+     ***********************/
+
+    template <class A>
+    struct allocator_alignment // primary template: an arbitrary allocator maps to unaligned_mode
+    {
+        using type = unaligned_mode;
+    };
+
+    template <class T>
+    struct allocator_alignment<aligned_allocator<T>> // aligned_allocator<T> with its default alignment argument
+    {
+        using type = aligned_mode;
+    };
+
+    template <class A>
+    using allocator_alignment_t = typename allocator_alignment<A>::type; // shorthand for the nested type
+
+    /***********************
+     * container alignment *
+     ***********************/
+
+    template <class C, class = void>
+    struct container_alignment // fallback: containers without an allocator_type are unaligned
+    {
+        using type = unaligned_mode;
+    };
+
+    template <class C>
+    struct container_alignment<C, detail::void_t<typename C::allocator_type>> // selected when C::allocator_type exists
+    {
+        using type = allocator_alignment_t<typename C::allocator_type>;
+    };
+
+    template <class C>
+    using container_alignment_t = typename container_alignment<C>::type; // shorthand for the nested type
+
+    /*********************
+     * alignment checker *
+     *********************/
+
+    /**
+     * Checks whether the address held by \c ptr is a multiple of the alignment
+     * reported by \c Arch::alignment().
+     * @return true if the alignment requirements are met
+     */
+    template <class Arch = default_arch>
+    inline bool is_aligned(void const* ptr)
+    {
+        return (reinterpret_cast<uintptr_t>(ptr) % static_cast<uintptr_t>(Arch::alignment())) == 0;
+    }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp
new file mode 100644
index 0000000000..1fe0773254
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp
@@ -0,0 +1,32 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#include "xsimd_fma3_sse_register.hpp"
+#include "xsimd_fma4_register.hpp"
+#include "xsimd_sse2_register.hpp"
+#include "xsimd_sse3_register.hpp"
+#include "xsimd_sse4_1_register.hpp"
+#include "xsimd_sse4_2_register.hpp"
+
+#include "xsimd_avx2_register.hpp"
+#include "xsimd_avx_register.hpp"
+#include "xsimd_fma3_avx2_register.hpp"
+#include "xsimd_fma3_avx_register.hpp"
+
+#include "xsimd_avx512bw_register.hpp"
+#include "xsimd_avx512cd_register.hpp"
+#include "xsimd_avx512dq_register.hpp"
+#include "xsimd_avx512f_register.hpp"
+
+#include "xsimd_neon64_register.hpp"
+#include "xsimd_neon_register.hpp"
+
+#include "xsimd_sve_register.hpp"
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp
new file mode 100644
index 0000000000..4baa84c39c
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp
@@ -0,0 +1,2310 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_API_HPP
+#define XSIMD_API_HPP
+
+#include <complex>
+#include <cstddef>
+#include <limits>
+#include <ostream>
+
+#include "../arch/xsimd_isa.hpp"
+#include "../types/xsimd_batch.hpp"
+#include "../types/xsimd_traits.hpp"
+
+namespace xsimd
+{
+    /**
+     * high level free functions
+     *
+     * @defgroup batch_arithmetic Arithmetic operators
+     * @defgroup batch_constant Constant batches
+     * @defgroup batch_data_transfer Memory operators
+     * @defgroup batch_math Basic math operators
+     * @defgroup batch_math_extra Extra math operators
+     * @defgroup batch_fp Floating point manipulation
+     * @defgroup batch_rounding Rounding operators
+     * @defgroup batch_conversion Conversion operators
+     * @defgroup batch_complex_op Complex operators
+     * @defgroup batch_logical Logical operators
+     * @defgroup batch_bitwise Bitwise operators
+     * @defgroup batch_reducers Reducers
+     * @defgroup batch_miscellaneous Miscellaneous
+     * @defgroup batch_trigo Trigonometry
+     *
+     * @defgroup batch_bool_logical Boolean logical operators
+     * @defgroup batch_bool_reducers Boolean reducers
+     */
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the absolute value of each scalar in the batch \c x.
+     * @param x batch of integer or floating point values.
+     * @return a batch holding the absolute values of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> abs(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::abs<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the absolute value of each complex in the batch \c z.
+     * @param z batch of complex values.
+     * @return a batch of real values (\c T) holding the absolute values of \c z.
+     */
+    template <class T, class A>
+    inline batch<T, A> abs(batch<std::complex<T>, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::abs<A>(z, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the sum of the batches \c x and \c y.
+     * @param x batch involved in the addition, for the same architecture \c A as \c y.
+     * @param y batch involved in the addition.
+     * @return the sum of \c x and \c y
+     */
+    template <class T, class A>
+    inline auto add(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x + y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x + y;
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the arc cosine of each element of the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the arc cosine of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> acos(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::acos<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the inverse hyperbolic cosine of each element of the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the inverse hyperbolic cosine of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> acosh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::acosh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the argument of each element of the batch \c z.
+     * @param z batch of complex or real values.
+     * @return the argument of \c z, as the real batch type associated with \c z.
+     */
+    template <class T, class A>
+    inline real_batch_type_t<batch<T, A>> arg(batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::arg<A>(z, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the arc sine of each element of the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the arc sine of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> asin(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::asin<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the inverse hyperbolic sine of each element of the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the inverse hyperbolic sine of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> asinh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::asinh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the arc tangent of each element of the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the arc tangent of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> atan(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::atan<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the element-wise arc tangent of \c x/y, using the signs of the
+     * arguments to determine the correct quadrant.
+     * @param x batch of floating point values (numerators).
+     * @param y batch of floating point values (denominators).
+     * @return the arc tangent of \c x/y.
+     */
+    template <class T, class A>
+    inline batch<T, A> atan2(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::atan2<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the inverse hyperbolic tangent of each element of the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the inverse hyperbolic tangent of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> atanh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::atanh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Converts the batch_bool \c x from \c T_in to \c T_out; both types must yield the same batch size.
+     * @param x batch_bool of \c T_in
+     * @return \c x cast to a batch_bool of \c T_out
+     */
+    template <class T_out, class T_in, class A>
+    inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T_out, A>();
+        detail::static_check_supported_config<T_in, A>();
+        static_assert(batch_bool<T_out, A>::size == batch_bool<T_in, A>::size, "Casting between incompatibles batch_bool types.");
+        return kernel::batch_bool_cast<A>(x, batch_bool<T_out, A> {}, A {});
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Perform a static_cast from \c T_in to \c T_out on \c x.
+     * @param x batch of \c T_in
+     * @return \c x cast to a batch of \c T_out
+     */
+    template <class T_out, class T_in, class A>
+    inline batch<T_out, A> batch_cast(batch<T_in, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T_out, A>();
+        detail::static_check_supported_config<T_in, A>();
+        return kernel::batch_cast<A>(x, batch<T_out, A> {}, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Computes the sign bit of each element of \c x
+     * @param x batch of scalars
+     * @return a batch holding the sign bit of the elements of \c x
+     */
+    template <class T, class A>
+    inline batch<T, A> bitofsign(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitofsign<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise and of the batches \c x and \c y (same as <tt>x & y</tt>).
+     * @param x batch involved in the operation.
+     * @param y batch involved in the operation.
+     * @return the result of the bitwise and.
+     */
+    template <class T, class A>
+    inline auto bitwise_and(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x & y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x & y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise and of the boolean batches \c x and \c y (same as <tt>x & y</tt>).
+     * @param x batch_bool involved in the operation.
+     * @param y batch_bool involved in the operation.
+     * @return the result of the bitwise and.
+     */
+    template <class T, class A>
+    inline auto bitwise_and(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x & y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x & y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise and not of batches \c x and \c y, as implemented by kernel::bitwise_andnot.
+     * @param x batch involved in the operation.
+     * @param y batch involved in the operation.
+     * @return the result of the bitwise and not.
+     */
+    template <class T, class A>
+    inline batch<T, A> bitwise_andnot(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_andnot<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_bool_logical
+     *
+     * Computes the bitwise and not of the boolean batches \c x and \c y.
+     * @param x batch_bool involved in the operation.
+     * @param y batch_bool involved in the operation.
+     * @return the result of the bitwise and not.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_andnot<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Perform a reinterpret_cast from \c T_in to \c T_out on \c x.
+     * @param x batch of \c T_in
+     * @return \c x reinterpreted as a batch of \c T_out
+     */
+    template <class T_out, class T_in, class A>
+    inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T_in, A>();
+        detail::static_check_supported_config<T_out, A>();
+        return kernel::bitwise_cast<A>(x, batch<T_out, A> {}, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise not of the batch \c x.
+     * @param x batch involved in the operation.
+     * @return a batch holding the result of the bitwise not.
+     */
+    template <class T, class A>
+    inline batch<T, A> bitwise_not(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_not<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise not of the boolean batch \c x.
+     * @param x batch_bool involved in the operation.
+     * @return a batch_bool holding the result of the bitwise not.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_not<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise or of the batches \c x and \c y (same as <tt>x | y</tt>).
+     * @param x batch involved in the operation.
+     * @param y batch involved in the operation.
+     * @return the result of the bitwise or.
+     */
+    template <class T, class A>
+    inline auto bitwise_or(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x | y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x | y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise or of the boolean batches \c x and \c y (same as <tt>x | y</tt>).
+     * @param x batch_bool involved in the operation.
+     * @param y batch_bool involved in the operation.
+     * @return the result of the bitwise or.
+     */
+    template <class T, class A>
+    inline auto bitwise_or(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x | y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x | y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise xor of the batches \c x and \c y (same as <tt>x ^ y</tt>).
+     * @param x batch involved in the operation.
+     * @param y batch involved in the operation.
+     * @return the result of the bitwise xor.
+     */
+    template <class T, class A>
+    inline auto bitwise_xor(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x ^ y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x ^ y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise xor of the boolean batches \c x and \c y (same as <tt>x ^ y</tt>).
+     * @param x batch_bool involved in the operation.
+     * @param y batch_bool involved in the operation.
+     * @return the result of the bitwise xor.
+     */
+    template <class T, class A>
+    inline auto bitwise_xor(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x ^ y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x ^ y;
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the single value \c v, through <tt>batch<T, A>::broadcast</tt>.
+     * @param v the value used to initialize the batch
+     * @return a new batch instance
+     */
+    template <class T, class A = default_arch>
+    inline batch<T, A> broadcast(T v) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch<T, A>::broadcast(v);
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the single value \c v, converted to the value type of
+     * the batch selected by \c To (\c v stays a \c bool when \c From is \c bool).
+     * @param v the value used to initialize the batch
+     * @return a new batch instance
+     */
+    template <class To, class A = default_arch, class From>
+    inline simd_return_type<From, To, A> broadcast_as(From v) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        using batch_value_type = typename simd_return_type<From, To, A>::value_type;
+        using value_type = typename std::conditional<std::is_same<From, bool>::value,
+                                                     bool,
+                                                     batch_value_type>::type;
+        return simd_return_type<From, To, A>(value_type(v));
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the cubic root of each element of the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the cubic root of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> cbrt(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::cbrt<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Computes, for each scalar in \c x, the smallest integer value
+     * not less than that scalar.
+     * @param x batch of floating point values.
+     * @return the batch of smallest integer values not less than \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> ceil(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::ceil<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Clips the values of the batch \c x between those of the batches \c lo and \c hi.
+     * @param x batch of scalar values to clip.
+     * @param lo batch of scalar values used as lower bounds.
+     * @param hi batch of scalar values used as upper bounds.
+     * @return the result of the clipping.
+     */
+    template <class T, class A>
+    inline batch<T, A> clip(batch<T, A> const& x, batch<T, A> const& lo, batch<T, A> const& hi) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::clip(x, lo, hi, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the conjugate of the batch \c z.
+     * @param z batch of complex values.
+     * @return the conjugate of \c z.
+     */
+    template <class A, class T>
+    inline complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& z) noexcept
+    {
+        return kernel::conj(z, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Computes a value whose absolute value matches
+     * that of \c x, but whose sign bit matches that of \c y.
+     * @param x batch of scalars providing the absolute values
+     * @param y batch of scalars providing the sign bits
+     * @return batch whose absolute value matches that of \c x, but whose sign bit
+     * matches that of \c y.
+     */
+    template <class T, class A>
+    inline batch<T, A> copysign(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::copysign<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the cosine of each element of the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the cosine of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> cos(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::cos<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the hyperbolic cosine of each element of the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the hyperbolic cosine of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> cosh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::cosh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the division of the batch \c x by the batch \c y (same as <tt>x / y</tt>).
+     * @param x batch of scalars used as dividends
+     * @param y batch of scalars used as divisors
+     * @return the result of the division.
+     */
+    template <class T, class A>
+    inline auto div(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x / y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x / y;
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise equality comparison of batches \c x and \c y (same as <tt>x == y</tt>).
+     * @param x batch of scalars
+     * @param y batch of scalars
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline auto eq(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x == y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x == y;
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the natural exponential of each element of the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the natural exponential of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> exp(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::exp<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the base 10 exponential of each element of the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the base 10 exponential of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> exp10(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::exp10<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the base 2 exponential of each element of the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the base 2 exponential of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> exp2(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::exp2<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the natural exponential of each element of the batch \c x, minus one.
+     * @param x batch of floating point values.
+     * @return a batch holding the natural exponential of the elements of \c x, minus one.
+     */
+    template <class T, class A>
+    inline batch<T, A> expm1(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::expm1<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes the error function of each element of the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the error function of the elements of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> erf(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::erf<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Element-wise complementary error function of the batch \c x.
+     *
+     * The work is delegated to the \c erfc kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return a batch holding the complementary error function of each scalar of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> erfc(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::erfc<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * Extracts a vector from a pair of vectors.
+     *
+     * Takes the lowest vector elements from the second source \c x and the
+     * highest vector elements from the first source \c y, then concatenates
+     * them into the returned batch. The work is delegated to the
+     * \c extract_pair kernel selected by the architecture \c A.
+     * @param x batch of integer or floating point values.
+     * @param y batch of integer or floating point values.
+     * @param i integer specifying the lowest vector element to extract from the first source register
+     * @return the batch built from the extracted elements.
+     */
+    template <class T, class A>
+    inline batch<T, A> extract_pair(batch<T, A> const& x, batch<T, A> const& y, std::size_t i) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::extract_pair<arch_tag>(x, y, i, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Element-wise absolute value of the batch \c x.
+     *
+     * This is a floating point flavoured name for the \c abs kernel, to which
+     * the call is forwarded.
+     * @param x batch of floating point values.
+     * @return a batch holding the absolute value of each scalar of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> fabs(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::abs<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Element-wise positive difference between \c x and \c y, that is
+     * <tt>max(0, x-y)</tt>.
+     *
+     * The work is delegated to the \c fdim kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return a batch holding the positive differences.
+     */
+    template <class T, class A>
+    inline batch<T, A> fdim(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::fdim<arch_tag>(x, y, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Rounds every scalar of \c x down, i.e. to the largest integer value
+     * that is not greater than the scalar.
+     *
+     * The work is delegated to the \c floor kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return the batch of largest integer values not greater than \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> floor(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::floor<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Fused multiply-add: computes <tt>(x*y) + z</tt>, in a single instruction
+     * when possible.
+     *
+     * The work is delegated to the \c fma kernel selected by the architecture \c A.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return the result of the fused multiply-add operation.
+     */
+    template <class T, class A>
+    inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::fma<arch_tag>(x, y, z, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Element-wise maximum of the batches \c x and \c y.
+     *
+     * The call is forwarded to the \c max kernel.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the larger values.
+     */
+    template <class T, class A>
+    inline batch<T, A> fmax(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::max<arch_tag>(x, y, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Element-wise minimum of the batches \c x and \c y.
+     *
+     * The call is forwarded to the \c min kernel.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the smaller values.
+     */
+    template <class T, class A>
+    inline batch<T, A> fmin(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::min<arch_tag>(x, y, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Element-wise modulo of the batch \c x by the batch \c y.
+     *
+     * The work is delegated to the \c fmod kernel selected by the architecture \c A.
+     * @param x batch involved in the modulo.
+     * @param y batch involved in the modulo.
+     * @return the result of the modulo.
+     */
+    template <class T, class A>
+    inline batch<T, A> fmod(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::fmod<arch_tag>(x, y, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Fused multiply-sub: computes <tt>(x*y) - z</tt>, in a single instruction
+     * when possible.
+     *
+     * The work is delegated to the \c fms kernel selected by the architecture \c A.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return the result of the fused multiply-sub operation.
+     */
+    template <class T, class A>
+    inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::fms<arch_tag>(x, y, z, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Fused negated multiply-add: computes <tt>-(x*y) + z</tt>, in a single
+     * instruction when possible.
+     *
+     * The work is delegated to the \c fnma kernel selected by the architecture \c A.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return the result of the fused negated multiply-add operation.
+     */
+    template <class T, class A>
+    inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::fnma<arch_tag>(x, y, z, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Fused negated multiply-sub: computes <tt>-(x*y) - z</tt>, in a single
+     * instruction when possible.
+     *
+     * The work is delegated to the \c fnms kernel selected by the architecture \c A.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return the result of the fused negated multiply-sub operation.
+     */
+    template <class T, class A>
+    inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::fnms<arch_tag>(x, y, z, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_fp
+     *
+     * Splits the number \c x into a normalized fraction and an exponent; the
+     * exponent is stored in \c y.
+     *
+     * The work is delegated to the \c frexp kernel selected by the architecture \c A.
+     * @param x a batch of floating point values.
+     * @param y output batch of integer values, passed by non-const reference to the kernel.
+     * @return the normalized fraction of \c x
+     */
+    template <class T, class A>
+    inline batch<T, A> frexp(const batch<T, A>& x, batch<as_integer_t<T>, A>& y) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::frexp<arch_tag>(x, y, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise "greater than or equal to" comparison of \c x and \c y.
+     *
+     * Named equivalent of <tt>x >= y</tt>.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> ge(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        batch_bool<T, A> mask = x >= y;
+        return mask;
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise "greater than" comparison of \c x and \c y.
+     *
+     * Named equivalent of <tt>x > y</tt>.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> gt(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        batch_bool<T, A> mask = x > y;
+        return mask;
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Generic reduction expressed only in terms of batch operations.
+     *
+     * The functor is perfectly forwarded to \c kernel::detail::reduce together
+     * with the batch size encoded as a \c std::integral_constant.
+     * @param f reducing function, accepting `batch ()(batch, batch)`
+     * @param x batch involved in the reduction
+     * @return the result of the reduction, as a scalar.
+     */
+    template <class T, class A, class F>
+    inline T reduce(F&& f, batch<T, A> const& x) noexcept
+    {
+        using lane_count = std::integral_constant<unsigned, batch<T, A>::size>;
+        detail::static_check_supported_config<T, A>();
+        return kernel::detail::reduce(std::forward<F>(f), x, lane_count());
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Sums all the scalars of the batch \c x.
+     *
+     * The work is delegated to the \c reduce_add kernel selected by the architecture \c A.
+     * @param x batch involved in the reduction
+     * @return the sum, as a scalar.
+     */
+    template <class T, class A>
+    inline T reduce_add(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::reduce_add<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Maximum of all the scalars of the batch \c x.
+     *
+     * The work is delegated to the \c reduce_max kernel selected by the architecture \c A.
+     * @param x batch involved in the reduction
+     * @return the maximum, as a scalar.
+     */
+    template <class T, class A>
+    inline T reduce_max(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::reduce_max<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Minimum of all the scalars of the batch \c x.
+     *
+     * The work is delegated to the \c reduce_min kernel selected by the architecture \c A.
+     * @param x batch involved in the reduction
+     * @return the minimum, as a scalar.
+     */
+    template <class T, class A>
+    inline T reduce_min(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::reduce_min<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Parallel horizontal addition: the scalars of each batch of the array
+     * pointed to by \c row are added together, and the sums are stored in the
+     * returned batch.
+     *
+     * The work is delegated to the \c haddp kernel selected by the architecture \c A.
+     * @param row pointer to an array of batches
+     * @return the result of the reduction.
+     */
+    template <class T, class A>
+    inline batch<T, A> haddp(batch<T, A> const* row) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::haddp<arch_tag>(row, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Element-wise square root of the sum of the squares of \c x and \c y.
+     *
+     * The work is delegated to the \c hypot kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return the square root of the sum of the squares of \c x and \c y.
+     */
+    template <class T, class A>
+    inline batch<T, A> hypot(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::hypot<arch_tag>(x, y, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Extracts the imaginary part of the batch \c x.
+     *
+     * The work is delegated to the \c imag kernel selected by the architecture \c A.
+     * @param x batch of complex or real values.
+     * @return the imaginary part of \c x, as a batch of real values.
+     */
+    template <class T, class A>
+    inline real_batch_type_t<batch<T, A>> imag(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::imag<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_constant
+     *
+     * Return a batch of scalars representing positive infinity
+     *
+     * Declared \c inline and \c noexcept like its counterpart \c minusinfinity.
+     * @tparam B the batch type to build; its \c value_type and \c arch_type
+     * are checked against the supported configurations.
+     * @return a batch of positive infinity
+     */
+    template <class B>
+    inline B infinity() noexcept
+    {
+        using T = typename B::value_type;
+        using A = typename B::arch_type;
+        detail::static_check_supported_config<T, A>();
+        return B(std::numeric_limits<T>::infinity());
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Builds a copy of \c x in which the element at position \c pos is
+     * replaced by \c val.
+     *
+     * The work is delegated to the \c insert kernel selected by the architecture \c A.
+     * @param x batch
+     * @param val value to set
+     * @param pos index of the updated slot
+     * @return copy of \c x with position \c pos set to \c val
+     */
+    template <class T, class A, size_t I>
+    inline batch<T, A> insert(batch<T, A> const& x, T val, index<I> pos) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::insert<arch_tag>(x, val, pos, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Tests, element-wise, whether the scalars of \c x represent an even
+     * integer value.
+     *
+     * The work is delegated to the \c is_even kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> is_even(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::is_even<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Tests, element-wise, whether the floating-point scalars of \c x
+     * represent an integer value.
+     *
+     * The work is delegated to the \c is_flint kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> is_flint(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::is_flint<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Tests, element-wise, whether the scalars of \c x represent an odd
+     * integer value.
+     *
+     * The work is delegated to the \c is_odd kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> is_odd(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::is_odd<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Tests, element-wise, whether the scalars of \c x are inf values.
+     *
+     * The work is delegated to the \c isinf kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> isinf(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::isinf<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Tests, element-wise, whether the scalars of \c x are finite values.
+     *
+     * The work is delegated to the \c isfinite kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> isfinite(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::isfinite<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Tests, element-wise, whether the scalars of \c x are NaN values.
+     *
+     * The work is delegated to the \c isnan kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <class T, class A>
+    inline typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::isnan<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Multiplies, element-wise, the floating point number \c x by 2 raised to
+     * the power \c y.
+     *
+     * The work is delegated to the \c ldexp kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @param y batch of integer values.
+     * @return a batch of floating point values.
+     */
+    template <class T, class A>
+    inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& y) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::ldexp<arch_tag>(x, y, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise "less than or equal to" comparison of \c x and \c y.
+     *
+     * Named equivalent of <tt>x <= y</tt>.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> le(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        batch_bool<T, A> mask = x <= y;
+        return mask;
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Element-wise natural logarithm of the gamma function of the batch \c x.
+     *
+     * The work is delegated to the \c lgamma kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return the natural logarithm of the gamma function of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> lgamma(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::lgamma<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr and the specified
+     * batch value type \c To. The memory needs to be aligned.
+     *
+     * Both \c From and \c To are checked against the supported
+     * configurations of \c A before the \c load_aligned kernel is invoked.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <class To, class A = default_arch, class From>
+    inline simd_return_type<From, To, A> load_as(From const* ptr, aligned_mode) noexcept
+    {
+        using result_type = simd_return_type<From, To, A>;
+        using converter = kernel::convert<typename result_type::value_type>;
+        detail::static_check_supported_config<From, A>();
+        detail::static_check_supported_config<To, A>();
+        return kernel::load_aligned<A>(ptr, converter {}, A {});
+    }
+
+    // Overload for boolean buffers with aligned memory: the load is performed
+    // by the static \c load_aligned of the resulting batch type.
+    template <class To, class A = default_arch>
+    inline simd_return_type<bool, To, A> load_as(bool const* ptr, aligned_mode) noexcept
+    {
+        using mask_type = simd_return_type<bool, To, A>;
+        detail::static_check_supported_config<To, A>();
+        return mask_type::load_aligned(ptr);
+    }
+
+    // Overload for buffers of std::complex with aligned memory. Both \c To and
+    // \c From are checked against the supported configurations of \c A, as in
+    // the unaligned overload, before the \c load_complex_aligned kernel runs.
+    template <class To, class A = default_arch, class From>
+    inline simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        using batch_value_type = typename simd_return_type<std::complex<From>, To, A>::value_type;
+        return kernel::load_complex_aligned<A>(ptr, kernel::convert<batch_value_type> {}, A {});
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    // Overload for buffers of xtl::xcomplex with aligned memory: the pointer is
+    // reinterpreted as std::complex<From> and forwarded to that overload.
+    // The architecture \c A is forwarded explicitly so that a caller-selected
+    // architecture is not replaced by \c default_arch.
+    template <class To, class A = default_arch, class From, bool i3ec>
+    inline simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        return load_as<To, A>(reinterpret_cast<std::complex<From> const*>(ptr), aligned_mode());
+    }
+#endif
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr and the specified
+     * batch value type \c To. The memory does not need to be aligned.
+     *
+     * Both \c From and \c To are checked against the supported
+     * configurations of \c A before the \c load_unaligned kernel is invoked.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <class To, class A = default_arch, class From>
+    inline simd_return_type<From, To, A> load_as(From const* ptr, unaligned_mode) noexcept
+    {
+        using result_type = simd_return_type<From, To, A>;
+        using converter = kernel::convert<typename result_type::value_type>;
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        return kernel::load_unaligned<A>(ptr, converter {}, A {});
+    }
+
+    // Overload for boolean buffers with unaligned memory: the load is performed
+    // by the static \c load_unaligned of the resulting batch type. \c To is
+    // checked against the supported configurations of \c A, as in the aligned
+    // overload.
+    template <class To, class A = default_arch>
+    inline simd_return_type<bool, To, A> load_as(bool const* ptr, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        return simd_return_type<bool, To, A>::load_unaligned(ptr);
+    }
+
+    // Overload for buffers of std::complex with unaligned memory: after the
+    // configuration checks, the \c load_complex_unaligned kernel is invoked.
+    template <class To, class A = default_arch, class From>
+    inline simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, unaligned_mode) noexcept
+    {
+        using result_type = simd_return_type<std::complex<From>, To, A>;
+        using converter = kernel::convert<typename result_type::value_type>;
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        return kernel::load_complex_unaligned<A>(ptr, converter {}, A {});
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    // Overload for buffers of xtl::xcomplex with unaligned memory: the pointer
+    // is reinterpreted as std::complex<From> and forwarded to that overload.
+    // The architecture \c A is forwarded explicitly so that a caller-selected
+    // architecture is not replaced by \c default_arch.
+    template <class To, class A = default_arch, class From, bool i3ec>
+    inline simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        return load_as<To, A>(reinterpret_cast<std::complex<From> const*>(ptr), unaligned_mode());
+    }
+#endif
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr. The
+     * memory needs to be aligned.
+     *
+     * Aligned mode is the default when no mode tag is given. The call is
+     * forwarded to \c load_as with \c From as the target value type.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <class A = default_arch, class From>
+    inline batch<From, A> load(From const* ptr, aligned_mode = {}) noexcept
+    {
+        using value_type = From;
+        detail::static_check_supported_config<value_type, A>();
+        return load_as<value_type, A>(ptr, aligned_mode());
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr. The
+     * memory does not need to be aligned.
+     *
+     * The call is forwarded to \c load_as with \c From as the target value type.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <class A = default_arch, class From>
+    inline batch<From, A> load(From const* ptr, unaligned_mode) noexcept
+    {
+        using value_type = From;
+        detail::static_check_supported_config<value_type, A>();
+        return load_as<value_type, A>(ptr, unaligned_mode());
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr. The
+     * memory needs to be aligned.
+     *
+     * The call is forwarded to \c load_as in aligned mode, with \c From as the
+     * target value type.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <class A = default_arch, class From>
+    inline batch<From, A> load_aligned(From const* ptr) noexcept
+    {
+        using value_type = From;
+        detail::static_check_supported_config<value_type, A>();
+        return load_as<value_type, A>(ptr, aligned_mode());
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr. The
+     * memory does not need to be aligned.
+     *
+     * The call is forwarded to \c load_as in unaligned mode, with \c From as
+     * the target value type.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <class A = default_arch, class From>
+    inline batch<From, A> load_unaligned(From const* ptr) noexcept
+    {
+        using value_type = From;
+        detail::static_check_supported_config<value_type, A>();
+        return load_as<value_type, A>(ptr, unaligned_mode());
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Element-wise natural logarithm of the batch \c x.
+     *
+     * The work is delegated to the \c log kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return the natural logarithm of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> log(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::log<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_math
+     * Element-wise base 2 logarithm of the batch \c x.
+     *
+     * The work is delegated to the \c log2 kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return the base 2 logarithm of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> log2(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::log2<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_math
+     * Element-wise base 10 logarithm of the batch \c x.
+     *
+     * The work is delegated to the \c log10 kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return the base 10 logarithm of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> log10(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::log10<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_math
+     * Element-wise natural logarithm of one plus the batch \c x.
+     *
+     * The work is delegated to the \c log1p kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return the natural logarithm of one plus \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> log1p(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::log1p<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise "less than" comparison of \c x and \c y.
+     *
+     * Named equivalent of <tt>x < y</tt>.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> lt(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        batch_bool<T, A> mask = x < y;
+        return mask;
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Element-wise maximum of the batches \c x and \c y.
+     *
+     * The work is delegated to the \c max kernel selected by the architecture \c A.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the larger values.
+     */
+    template <class T, class A>
+    inline batch<T, A> max(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::max<arch_tag>(x, y, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Element-wise minimum of the batches \c x and \c y.
+     *
+     * The work is delegated to the \c min kernel selected by the architecture \c A.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the smaller values.
+     */
+    template <class T, class A>
+    inline batch<T, A> min(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::min<arch_tag>(x, y, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_constant
+     *
+     * Return a batch of scalars representing negative infinity
+     *
+     * The batch is built from the negated
+     * <tt>std::numeric_limits<value_type>::infinity()</tt>.
+     * @tparam B the batch type to build.
+     * @return a batch of negative infinity
+     */
+    template <class B>
+    inline B minusinfinity() noexcept
+    {
+        using value_type = typename B::value_type;
+        using arch_type = typename B::arch_type;
+        detail::static_check_supported_config<value_type, arch_type>();
+        return B(-std::numeric_limits<value_type>::infinity());
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Element-wise integer modulo of the batch \c x by the batch \c y.
+     *
+     * Named equivalent of <tt>x % y</tt>.
+     * @param x batch involved in the modulo.
+     * @param y batch involved in the modulo.
+     * @return the result of the modulo.
+     */
+    template <class T, class A>
+    inline auto mod(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x % y)
+    {
+        detail::static_check_supported_config<T, A>();
+        auto remainder = x % y;
+        return remainder;
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Element-wise product of the batches \c x and \c y.
+     *
+     * Named equivalent of <tt>x * y</tt>.
+     * @param x batch involved in the product.
+     * @param y batch involved in the product.
+     * @return the result of the product.
+     */
+    template <class T, class A>
+    inline auto mul(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x * y)
+    {
+        detail::static_check_supported_config<T, A>();
+        auto product = x * y;
+        return product;
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Rounds the scalars of \c x to integer values, kept in floating point
+     * format, using the current rounding mode.
+     *
+     * The work is delegated to the \c nearbyint kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @return the batch of nearest integer values.
+     */
+    template <class T, class A>
+    inline batch<T, A> nearbyint(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::nearbyint<arch_tag>(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Rounds the scalars of \c x to integer values, returned in integer
+     * format, using the current rounding mode.
+     *
+     * The work is delegated to the \c nearbyint_as_int kernel selected by the
+     * architecture \c A.
+     * @param x batch of floating point values.
+     * @return the batch of nearest integer values.
+     *
+     * @warning For very large values the conversion to int silently overflows.
+     */
+    template <class T, class A>
+    inline batch<as_integer_t<T>, A>
+    nearbyint_as_int(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::nearbyint_as_int(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise inequality comparison of \c x and \c y.
+     *
+     * Named equivalent of <tt>x != y</tt>.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline auto neq(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x != y)
+    {
+        detail::static_check_supported_config<T, A>();
+        auto mask = x != y;
+        return mask;
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Element-wise opposite of the batch \c x.
+     *
+     * Named equivalent of the unary <tt>-x</tt>.
+     * @param x batch involved in the operation.
+     * @return the opposite of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> neg(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        batch<T, A> negated = -x;
+        return negated;
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes, element-wise, the next representable floating-point value
+     * following \c x in the direction of \c y.
+     *
+     * The work is delegated to the \c nextafter kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return the next representable values after \c x in the direction of \c y.
+     */
+    template <class T, class A>
+    inline batch<T, A> nextafter(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::nextafter<arch_tag>(x, y, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Element-wise norm of the batch \c x.
+     *
+     * The work is delegated to the \c norm kernel selected by the architecture \c A.
+     * @param x batch of complex or real values.
+     * @return the norm of \c x, as a batch of real values.
+     */
+    template <class T, class A>
+    inline real_batch_type_t<batch<T, A>> norm(batch<T, A> const& x) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::norm(x, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Builds a complex batch from a magnitude \c r and a phase angle \c theta.
+     *
+     * When \c theta is omitted, a value-initialized batch is used. The work is
+     * delegated to the \c polar kernel selected by the architecture \c A.
+     * @param r The magnitude of the desired complex result.
+     * @param theta The phase angle of the desired complex result.
+     * @return \c r exp(i * \c theta).
+     */
+    template <class T, class A>
+    inline complex_batch_type_t<batch<T, A>> polar(batch<T, A> const& r, batch<T, A> const& theta = batch<T, A> {}) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::polar<arch_tag>(r, theta, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * No-op on \c x.
+     *
+     * Named equivalent of the unary <tt>+x</tt>.
+     * @param x batch involved in the operation.
+     * @return \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> pos(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        batch<T, A> unchanged = +x;
+        return unchanged;
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Raises, element-wise, the batch \c x to the power \c y.
+     *
+     * The work is delegated to the \c pow kernel selected by the architecture \c A.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return \c x raised to the power \c y.
+     */
+    template <class T, class A>
+    inline batch<T, A> pow(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::pow<arch_tag>(x, y, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Raises every scalar of the batch \c x to the integral power \c y.
+     *
+     * This overload only participates when \c ITy is an integral type; the
+     * work is delegated to the \c ipow kernel.
+     * @param x batch of values.
+     * @param y scalar integral exponent.
+     * @return \c x raised to the power \c y.
+     */
+    template <class T, class ITy, class A, class = typename std::enable_if<std::is_integral<ITy>::value, void>::type>
+    inline batch<T, A> pow(batch<T, A> const& x, ITy y) noexcept
+    {
+        using arch_tag = A;
+        detail::static_check_supported_config<T, arch_tag>();
+        return kernel::ipow<arch_tag>(x, y, arch_tag {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the projection of the batch \c z.
+     * @param z batch of complex or real values.
+     * @return the projection of \c z.
+     */
+    template <class T, class A>
+    inline complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::proj(z, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the approximate reciprocal of the batch \c x.
+     * The maximum relative error for this approximation is
+     * less than 1.5*2^-12.
+     * @param x batch of floating point numbers.
+     * @return the reciprocal.
+     */
+    template <class T, class A, class = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
+    inline batch<T, A> reciprocal(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::reciprocal(x, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the real part of the batch \c z.
+     * @param z batch of complex or real values.
+     * @return the real part of \c z.
+     */
+    template <class T, class A>
+    inline real_batch_type_t<batch<T, A>> real(batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::real<A>(z, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the remainder of dividing \c x by \c y
+     * @param x batch of scalar values
+     * @param y batch of scalar values
+     * @return the remainder of dividing \c x by \c y.
+     */
+    template <class T, class A>
+    inline batch<T, A> remainder(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::remainder<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Rounds the scalars in \c x to integer values (in floating point format), using
+     * the current rounding mode.
+     * @param x batch of floating point values.
+     * @return the batch of rounded values.
+     */
+    template <class T, class A>
+    inline batch<T, A> rint(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return nearbyint(x);
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Computes the batch of nearest integer values to scalars in \c x (in
+     * floating point format), rounding halfway cases away from zero, regardless
+     * of the current rounding mode.
+     * @param x batch of floating point values.
+     * @return the batch of nearest integer values.
+     */
+    template <class T, class A>
+    inline batch<T, A> round(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::round<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes an estimate of the inverse square root of the batch \c x.
+     *
+     * @warning Unlike most xsimd functions, this does not return the same result as the
+     * equivalent scalar operation, trading accuracy for speed.
+     *
+     * @param x batch of floating point values.
+     * @return the inverse square root of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> rsqrt(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::rsqrt<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the saturated sum of the batch \c x and the batch \c y.
+     *
+     * @tparam T the element type of the batches.
+     * @param x batch involved in the saturated addition.
+     * @param y batch involved in the saturated addition.
+     * @return the result of the saturated addition.
+     */
+    template <class T, class A>
+    inline batch<T, A> sadd(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sadd<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Ternary operator for batches: selects values from the batches \c true_br or \c false_br
+     * depending on the boolean values in the constant batch \c cond. Equivalent to
+     * \code{.cpp}
+     * for(std::size_t i = 0; i < N; ++i)
+     *     res[i] = cond[i] ? true_br[i] : false_br[i];
+     * \endcode
+     * @param cond batch condition.
+     * @param true_br batch values for truthy condition.
+     * @param false_br batch value for falsy condition.
+     * @return the result of the selection.
+     */
+    template <class T, class A>
+    inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::select<A>(cond, true_br, false_br, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Ternary operator for batches: selects values from the batches \c true_br or \c false_br
+     * depending on the boolean values in the constant batch \c cond. Equivalent to
+     * \code{.cpp}
+     * for(std::size_t i = 0; i < N; ++i)
+     *     res[i] = cond[i] ? true_br[i] : false_br[i];
+     * \endcode
+     * @param cond batch condition.
+     * @param true_br batch values for truthy condition.
+     * @param false_br batch value for falsy condition.
+     * @return the result of the selection.
+     */
+    template <class T, class A>
+    inline batch<std::complex<T>, A> select(batch_bool<T, A> const& cond, batch<std::complex<T>, A> const& true_br, batch<std::complex<T>, A> const& false_br) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::select<A>(cond, true_br, false_br, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Ternary operator for batches: selects values from the batches \c true_br or \c false_br
+     * depending on the boolean values in the constant batch \c cond. Equivalent to
+     * \code{.cpp}
+     * for(std::size_t i = 0; i < N; ++i)
+     *     res[i] = cond[i] ? true_br[i] : false_br[i];
+     * \endcode
+     * @param cond constant batch condition.
+     * @param true_br batch values for truthy condition.
+     * @param false_br batch value for falsy condition.
+     * @return the result of the selection.
+     */
+    template <class T, class A, bool... Values>
+    inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::select<A>(cond, true_br, false_br, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Computes the sign of \c x
+     * @param x batch
+     * @return -1 for each negative element, 0 for each null element and +1 for each positive element
+     */
+    template <class T, class A>
+    inline batch<T, A> sign(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sign<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Computes the sign of \c x, assuming x doesn't have any zero
+     * @param x batch
+     * @return -1 for each negative element, -1 or +1 for each null element and +1 for each positive element
+     */
+    template <class T, class A>
+    inline batch<T, A> signnz(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::signnz<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the sine of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the sine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> sin(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sin<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the sine and the cosine of the batch \c x. This method is faster
+     * than calling sine and cosine independently.
+     * @param x batch of floating point values.
+     * @return a pair containing the sine then the cosine of  batch \c x
+     */
+    template <class T, class A>
+    inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sincos<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the hyperbolic sine of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the hyperbolic sine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> sinh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sinh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Slide the whole batch to the left by \c N bytes. This is different from
+     * \c bitwise_lshift that shifts each batch element to the left.
+     *
+     * @tparam N Amount of bytes to slide to the left.
+     * @param x batch of integer values.
+     * @return slided batch.
+     */
+    template <size_t N, class T, class A>
+    inline batch<T, A> slide_left(batch<T, A> const& x) noexcept
+    {
+        static_assert(std::is_integral<T>::value, "can only slide batch of integers");
+        detail::static_check_supported_config<T, A>();
+        return kernel::slide_left<N, A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Slide the whole batch to the right by \c N bytes. This is different from
+     * \c bitwise_rshift that shifts each batch element to the right.
+     *
+     * @tparam N Amount of bytes to slide to the right.
+     * @param x batch of integer values.
+     * @return slided batch.
+     */
+    template <size_t N, class T, class A>
+    inline batch<T, A> slide_right(batch<T, A> const& x) noexcept
+    {
+        static_assert(std::is_integral<T>::value, "can only slide batch of integers");
+        detail::static_check_supported_config<T, A>();
+        return kernel::slide_right<N, A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the square root of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the square root of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> sqrt(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sqrt<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the saturated difference of the batch \c x and the batch \c y.
+     * @tparam T the element type of the batches.
+     * @param x batch involved in the saturated difference.
+     * @param y batch involved in the saturated difference.
+     * @return the result of the saturated difference.
+     */
+    template <class T, class A>
+    inline batch<T, A> ssub(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::ssub<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copy content of batch \c src to the buffer \c dst. The
+     * memory needs to be aligned.
+     * @param dst the memory buffer to write to
+     * @param src the batch to copy
+     */
+    template <class To, class A = default_arch, class From>
+    inline void store_as(To* dst, batch<From, A> const& src, aligned_mode) noexcept
+    {
+        kernel::store_aligned(dst, src, A {});
+    }
+
+    template <class A = default_arch, class From>
+    inline void store_as(bool* dst, batch_bool<From, A> const& src, aligned_mode) noexcept
+    {
+        kernel::store(src, dst, A {});
+    }
+
+    template <class To, class A = default_arch, class From>
+    inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
+    {
+        kernel::store_complex_aligned(dst, src, A {});
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class To, class A = default_arch, class From, bool i3ec>
+    inline void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
+    {
+        store_as(reinterpret_cast<std::complex<To>*>(dst), src, aligned_mode());
+    }
+#endif
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copy content of batch \c src to the buffer \c dst. The
+     * memory does not need to be aligned.
+     * @param dst the memory buffer to write to
+     * @param src the batch to copy
+     */
+    template <class To, class A = default_arch, class From>
+    inline void store_as(To* dst, batch<From, A> const& src, unaligned_mode) noexcept
+    {
+        kernel::store_unaligned(dst, src, A {});
+    }
+
+    template <class A = default_arch, class From>
+    inline void store_as(bool* dst, batch_bool<From, A> const& src, unaligned_mode) noexcept
+    {
+        kernel::store(src, dst, A {});
+    }
+
+    template <class To, class A = default_arch, class From>
+    inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
+    {
+        kernel::store_complex_unaligned(dst, src, A {});
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class To, class A = default_arch, class From, bool i3ec>
+    inline void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
+    {
+        store_as(reinterpret_cast<std::complex<To>*>(dst), src, unaligned_mode());
+    }
+#endif
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copy content of batch \c val to the buffer \c mem. The
+     * memory needs to be aligned.
+     * @param mem the memory buffer to write to
+     * @param val the batch to copy from
+     */
+    template <class A, class T>
+    inline void store(T* mem, batch<T, A> const& val, aligned_mode = {}) noexcept
+    {
+        store_as<T, A>(mem, val, aligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copy content of batch \c val to the buffer \c mem. The
+     * memory does not need to be aligned.
+     * @param mem the memory buffer to write to
+     * @param val the batch to copy from
+     */
+    template <class A, class T>
+    inline void store(T* mem, batch<T, A> const& val, unaligned_mode) noexcept
+    {
+        store_as<T, A>(mem, val, unaligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copy content of batch \c val to the buffer \c mem. The
+     * memory needs to be aligned.
+     * @param mem the memory buffer to write to
+     * @param val the batch to copy from
+     */
+    template <class A, class T>
+    inline void store_aligned(T* mem, batch<T, A> const& val) noexcept
+    {
+        store_as<T, A>(mem, val, aligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copy content of batch \c val to the buffer \c mem. The
+     * memory does not need to be aligned.
+     * @param mem the memory buffer to write to
+     * @param val the batch to copy
+     */
+    template <class A, class T>
+    inline void store_unaligned(T* mem, batch<T, A> const& val) noexcept
+    {
+        store_as<T, A>(mem, val, unaligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the difference between \c x and \c y
+     * @tparam T the element type of the batches.
+     * @param x batch of scalars
+     * @param y batch of scalars
+     * @return the difference between \c x and \c y
+     */
+    template <class T, class A>
+    inline auto sub(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x - y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x - y;
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Rearrange elements from \c x according to mask \c mask
+     * @param x batch
+     * @param mask constant batch mask of integer elements of the same size as
+     * element of \c x
+     * @return swizzled batch
+     */
+    template <class T, class A, class Vt, Vt... Values>
+    inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
+    swizzle(batch<T, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept
+    {
+        static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
+        detail::static_check_supported_config<T, A>();
+        return kernel::swizzle<A>(x, mask, A {});
+    }
+    template <class T, class A, class Vt, Vt... Values>
+    inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept
+    {
+        static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
+        detail::static_check_supported_config<T, A>();
+        return kernel::swizzle<A>(x, mask, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the tangent of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the tangent of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> tan(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::tan<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the hyperbolic tangent of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the hyperbolic tangent of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> tanh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::tanh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes the gamma function of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the gamma function of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> tgamma(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::tgamma<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Perform a conversion from \c i to a value of a floating point type of the same size as \c T.
+     * This is equivalent to \c batch_cast<as_float_t<T>>(i)
+     * @param i batch of integers.
+     * @return \c i converted to a value of a floating point type of the same size as \c T
+     */
+    template <class T, class A>
+    inline batch<as_float_t<T>, A> to_float(batch<T, A> const& i) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch_cast<as_float_t<T>>(i);
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Perform a conversion from \c x to a value of an integer type of the same size as \c T
+     * This is equivalent to \c batch_cast<as_integer_t<T>>(x)
+     * @param x batch.
+     * @return \c x converted to a value of an integer type of the same size as \c T
+     */
+    template <class T, class A>
+    inline batch<as_integer_t<T>, A> to_int(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch_cast<as_integer_t<T>>(x);
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Computes the batch of nearest integer values not greater in magnitude
+     * than scalars in \c x.
+     * @param x batch of floating point values.
+     * @return the batch of nearest integer values not greater in magnitude than \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> trunc(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::trunc<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Unpack and interleave data from the HIGH half of batches \c x and \c y.
+     * Store the results in the Return value.
+     * @param x a batch of integer or floating point or double precision values.
+     * @param y a batch of integer or floating point or double precision values.
+     * @return a batch of the high part of shuffled values.
+     */
+    template <class T, class A>
+    inline batch<T, A> zip_hi(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::zip_hi<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Unpack and interleave data from the LOW half of batches \c x and \c y.
+     * Store the results in the Return value.
+     * @param x a batch of integer or floating point or double precision values.
+     * @param y a batch of integer or floating point or double precision values.
+     * @return a batch of the low part of shuffled values.
+     */
+    template <class T, class A>
+    inline batch<T, A> zip_lo(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::zip_lo<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Cast a \c batch_bool of \c T into a \c batch of the same type using the
+     * following rule: if an element of \c self is true, it maps to -1 in the
+     * returned integral batch, otherwise it maps to 0.
+     *
+     * @param self batch_bool of \c T
+     * @return \c self cast to a \c batch of \c T
+     */
+    template <class T, class A, typename std::enable_if<std::is_integral<T>::value, int>::type = 3>
+    inline batch<T, A> bitwise_cast(batch_bool<T, A> const& self) noexcept
+    {
+        T z(0);
+        detail::static_check_supported_config<T, A>();
+        return select(self, batch<T, A>(T(~z)), batch<T, A>(z));
+    }
+
+    template <class T, class A, typename std::enable_if<std::is_floating_point<T>::value, int>::type = 3>
+    inline batch<T, A> bitwise_cast(batch_bool<T, A> const& self) noexcept
+    {
+        T z0(0), z1(0);
+        using int_type = as_unsigned_integer_t<T>;
+        int_type value(~int_type(0));
+        std::memcpy(&z1, &value, sizeof(int_type));
+        detail::static_check_supported_config<T, A>();
+        return select(self, batch<T, A>(z1), batch<T, A>(z0));
+    }
+
+    /**
+     * @ingroup batch_bool_reducers
+     *
+     * Returns true if all the boolean values in the batch are true,
+     * false otherwise.
+     * @param x the batch to reduce.
+     * @return a boolean scalar.
+     */
+    template <class T, class A>
+    inline bool all(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::all<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_bool_reducers
+     *
+     * Return true if any of the boolean values in the batch is true,
+     * false otherwise.
+     * @param x the batch to reduce.
+     * @return a boolean scalar.
+     */
+    template <class T, class A>
+    inline bool any(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::any<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_bool_reducers
+     *
+     * Return true if none of the boolean values in the batch is true,
+     * false otherwise.
+     * @param x the batch to reduce.
+     * @return a boolean scalar.
+     */
+    template <class T, class A>
+    inline bool none(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return !xsimd::any(x);
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Dump the content of batch \c x to stream \c o
+     * @param o the stream where the batch is dumped
+     * @param x batch to dump.
+     * @return a reference to \c o
+     */
+    template <class T, class A>
+    inline std::ostream& operator<<(std::ostream& o, batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        constexpr auto size = batch<T, A>::size;
+        alignas(A::alignment()) T buffer[size];
+        x.store_aligned(&buffer[0]);
+        o << '(';
+        for (std::size_t i = 0; i < size - 1; ++i)
+            o << buffer[i] << ", ";
+        return o << buffer[size - 1] << ')';
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Dump the content of batch \c x to stream \c o
+     * @param o the stream where the batch is dumped
+     * @param x batch to dump.
+     * @return a reference to \c o
+     */
+    template <class T, class A>
+    inline std::ostream& operator<<(std::ostream& o, batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        constexpr auto size = batch_bool<T, A>::size;
+        alignas(A::alignment()) bool buffer[size];
+        x.store_aligned(&buffer[0]);
+        o << '(';
+        for (std::size_t i = 0; i < size - 1; ++i)
+            o << buffer[i] << ", ";
+        return o << buffer[size - 1] << ')';
+    }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx2_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx2_register.hpp
new file mode 100644
index 0000000000..a02cdf8489
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_avx2_register.hpp
@@ -0,0 +1,40 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX2_REGISTER_HPP
+#define XSIMD_AVX2_REGISTER_HPP
+
+#include "./xsimd_avx_register.hpp"
+
+namespace xsimd
+{
+    /**
+     * @ingroup arch
+     *
+     * AVX2 instructions
+     */
+    struct avx2 : avx
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX2; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr unsigned version() noexcept { return generic::version(2, 2, 0); }
+        static constexpr char const* name() noexcept { return "avx2"; }
+    };
+
+#if XSIMD_WITH_AVX2
+    namespace types
+    {
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2, avx);
+    }
+#endif
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp
new file mode 100644
index 0000000000..49633f5db7
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp
@@ -0,0 +1,48 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512BW_REGISTER_HPP
+#define XSIMD_AVX512BW_REGISTER_HPP
+
+#include "./xsimd_avx512dq_register.hpp"
+
+namespace xsimd
+{
+
+    /**
+     * @ingroup arch
+     *
+     * AVX512BW instructions
+     */
+    struct avx512bw : avx512dq
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512BW; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr unsigned version() noexcept { return generic::version(3, 4, 0); }
+        static constexpr char const* name() noexcept { return "avx512bw"; }
+    };
+
+#if XSIMD_WITH_AVX512BW
+
+    namespace types
+    {
+        template <class T>
+        struct get_bool_simd_register<T, avx512bw>
+        {
+            using type = simd_avx512_bool_register<T>;
+        };
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512bw, avx512dq);
+
+    }
+#endif
+}
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp
new file mode 100644
index 0000000000..173d5817d6
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp
@@ -0,0 +1,48 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512CD_REGISTER_HPP
+#define XSIMD_AVX512CD_REGISTER_HPP
+
+#include "./xsimd_avx512f_register.hpp"
+
+namespace xsimd
+{
+
+    /**
+     * @ingroup arch
+     *
+     * AVX512CD instructions
+     */
+    struct avx512cd : avx512f
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512CD; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr unsigned version() noexcept { return generic::version(3, 2, 0); }
+        static constexpr char const* name() noexcept { return "avx512cd"; }
+    };
+
+#if XSIMD_WITH_AVX512CD
+
+    namespace types
+    {
+        template <class T>
+        struct get_bool_simd_register<T, avx512cd>
+        {
+            using type = simd_avx512_bool_register<T>;
+        };
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512cd, avx512f);
+
+    }
+#endif
+}
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp
new file mode 100644
index 0000000000..41846e7115
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp
@@ -0,0 +1,48 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512DQ_REGISTER_HPP
+#define XSIMD_AVX512DQ_REGISTER_HPP
+
+#include "./xsimd_avx512cd_register.hpp"
+
+namespace xsimd
+{
+
+    /**
+     * @ingroup arch
+     *
+     * AVX512DQ instructions (architecture tag deriving from avx512cd)
+     */
+    struct avx512dq : avx512cd
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512DQ; } // mirrors the XSIMD_WITH_AVX512DQ configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true in this definition
+        static constexpr unsigned version() noexcept { return generic::version(3, 3, 0); } // avx512cd uses (3, 2, 0)
+        static constexpr char const* name() noexcept { return "avx512dq"; } // textual identifier of this architecture
+    };
+
+#if XSIMD_WITH_AVX512DQ
+
+    namespace types
+    {
+        template <class T>
+        struct get_bool_simd_register<T, avx512dq> // boolean batches on avx512dq use the mask-based wrapper declared for avx512f
+        {
+            using type = simd_avx512_bool_register<T>;
+        };
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512dq, avx512cd); // macro defined elsewhere; presumably reuses the avx512cd register types
+
+    }
+#endif
+}
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp
new file mode 100644
index 0000000000..c24a91af5e
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp
@@ -0,0 +1,75 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512F_REGISTER_HPP
+#define XSIMD_AVX512F_REGISTER_HPP
+
+#include "./xsimd_generic_arch.hpp"
+
+namespace xsimd
+{
+
+    /**
+     * @ingroup arch
+     *
+     * AVX512F instructions (architecture tag deriving from generic)
+     */
+    struct avx512f : generic
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512F; } // mirrors the XSIMD_WITH_AVX512F configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true in this definition
+        static constexpr unsigned version() noexcept { return generic::version(3, 1, 0); } // avx uses (2, 1, 0)
+        static constexpr std::size_t alignment() noexcept { return 64; } // value used by the alignment assertions of batch<T, A>
+        static constexpr bool requires_alignment() noexcept { return true; }
+        static constexpr char const* name() noexcept { return "avx512f"; } // textual identifier of this architecture
+    };
+
+#if XSIMD_WITH_AVX512F
+
+    namespace types
+    {
+        template <class T>
+        struct simd_avx512_bool_register // thin wrapper around the mask type selected from sizeof(T)
+        {
+            using register_type = typename std::conditional<
+                (sizeof(T) == 1), __mmask64,
+                typename std::conditional<(sizeof(T) < 4), __mmask32, typename std::conditional<(sizeof(T) == 4), __mmask16, __mmask8>::type>::type>::type;
+            register_type data;
+            simd_avx512_bool_register() = default;
+            simd_avx512_bool_register(register_type r) : data(r) { }
+            operator register_type() const noexcept { return data; }
+        };
+        template <class T>
+        struct get_bool_simd_register<T, avx512f> // boolean batches on avx512f are stored as masks
+        {
+            using type = simd_avx512_bool_register<T>;
+        };
+
+        XSIMD_DECLARE_SIMD_REGISTER(bool, avx512f, __m512i);
+        XSIMD_DECLARE_SIMD_REGISTER(signed char, avx512f, __m512i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx512f, __m512i);
+        XSIMD_DECLARE_SIMD_REGISTER(char, avx512f, __m512i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx512f, __m512i);
+        XSIMD_DECLARE_SIMD_REGISTER(short, avx512f, __m512i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx512f, __m512i);
+        XSIMD_DECLARE_SIMD_REGISTER(int, avx512f, __m512i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx512f, __m512i);
+        XSIMD_DECLARE_SIMD_REGISTER(long int, avx512f, __m512i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx512f, __m512i);
+        XSIMD_DECLARE_SIMD_REGISTER(long long int, avx512f, __m512i);
+        XSIMD_DECLARE_SIMD_REGISTER(float, avx512f, __m512);
+        XSIMD_DECLARE_SIMD_REGISTER(double, avx512f, __m512d);
+
+    }
+#endif
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp
new file mode 100644
index 0000000000..596a6b7028
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp
@@ -0,0 +1,62 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX_REGISTER_HPP
+#define XSIMD_AVX_REGISTER_HPP
+
+#include "./xsimd_generic_arch.hpp"
+
+namespace xsimd
+{
+
+    /**
+     * @ingroup arch
+     *
+     * AVX instructions (architecture tag deriving from generic)
+     */
+    struct avx : generic
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; } // mirrors the XSIMD_WITH_AVX configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true in this definition
+        static constexpr unsigned version() noexcept { return generic::version(2, 1, 0); }
+        static constexpr std::size_t alignment() noexcept { return 32; } // value used by the alignment assertions of batch<T, A>
+        static constexpr bool requires_alignment() noexcept { return true; }
+        static constexpr char const* name() noexcept { return "avx"; } // textual identifier of this architecture
+    };
+}
+
+#if XSIMD_WITH_AVX
+
+#include <immintrin.h>
+
+namespace xsimd
+{
+    namespace types
+    {
+        // Every integral type (bool included) maps to __m256i; float and double use __m256 and __m256d.
+        XSIMD_DECLARE_SIMD_REGISTER(bool, avx, __m256i);
+        XSIMD_DECLARE_SIMD_REGISTER(signed char, avx, __m256i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx, __m256i);
+        XSIMD_DECLARE_SIMD_REGISTER(char, avx, __m256i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx, __m256i);
+        XSIMD_DECLARE_SIMD_REGISTER(short, avx, __m256i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx, __m256i);
+        XSIMD_DECLARE_SIMD_REGISTER(int, avx, __m256i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx, __m256i);
+        XSIMD_DECLARE_SIMD_REGISTER(long int, avx, __m256i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx, __m256i);
+        XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i);
+        XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256);
+        XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d);
+    }
+}
+#endif
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp b/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp
new file mode 100644
index 0000000000..9aefc1b12c
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp
@@ -0,0 +1,1491 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_BATCH_HPP
+#define XSIMD_BATCH_HPP
+
+#include <cassert>
+#include <complex>
+
+#include "../config/xsimd_arch.hpp"
+#include "../memory/xsimd_alignment.hpp"
+#include "./xsimd_utils.hpp"
+
+namespace xsimd
+{
+    template <class T, class A = default_arch>
+    class batch; // forward declaration; the architecture parameter defaults to default_arch
+
+    namespace types
+    {
+        template <class T, class A>
+        struct integral_only_operators // base class of batch<T, A> contributing the %, << and >> operator families
+        {
+            inline batch<T, A>& operator%=(batch<T, A> const& other) noexcept;
+            inline batch<T, A>& operator>>=(int32_t other) noexcept;
+            inline batch<T, A>& operator>>=(batch<T, A> const& other) noexcept;
+            inline batch<T, A>& operator<<=(int32_t other) noexcept;
+            inline batch<T, A>& operator<<=(batch<T, A> const& other) noexcept;
+
+            /** Shorthand for xsimd::mod() */
+            friend batch<T, A> operator%(batch<T, A> const& self, batch<T, A> const& other) noexcept
+            {
+                return batch<T, A>(self) %= other; // apply the update operator on a copy of self
+            }
+
+            /** Shorthand for xsimd::bitwise_rshift() */
+            friend batch<T, A> operator>>(batch<T, A> const& self, batch<T, A> const& other) noexcept
+            {
+                return batch<T, A>(self) >>= other;
+            }
+
+            /** Shorthand for xsimd::bitwise_lshift() */
+            friend batch<T, A> operator<<(batch<T, A> const& self, batch<T, A> const& other) noexcept
+            {
+                return batch<T, A>(self) <<= other;
+            }
+
+            /** Shorthand for xsimd::bitwise_rshift() */
+            friend batch<T, A> operator>>(batch<T, A> const& self, int32_t other) noexcept
+            {
+                return batch<T, A>(self) >>= other;
+            }
+
+            /** Shorthand for xsimd::bitwise_lshift() */
+            friend batch<T, A> operator<<(batch<T, A> const& self, int32_t other) noexcept
+            {
+                return batch<T, A>(self) <<= other;
+            }
+        };
+        template <class A>
+        struct integral_only_operators<float, A> // empty on purpose: float batches get none of the operators above
+        {
+        };
+        template <class A>
+        struct integral_only_operators<double, A> // empty on purpose: double batches get none of the operators above
+        {
+        };
+
+    }
+
+    namespace details
+    {
+        // These functions are forward declared here so that they can be used by the friend
+        // comparison operators of batch<T, A>. Their implementation must appear only once the
+        // kernel implementations have been included.
+        template <class T, class A>
+        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other) noexcept; // backs operator==
+
+        template <class T, class A>
+        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other) noexcept; // backs operator!=
+
+        template <class T, class A>
+        inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other) noexcept; // backs operator>=
+
+        template <class T, class A>
+        inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other) noexcept; // backs operator<=
+
+        template <class T, class A>
+        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other) noexcept; // backs operator>
+
+        template <class T, class A>
+        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other) noexcept; // backs operator<
+    }
+
+    /**
+     * @brief batch of integer or floating point values.
+     *
+     * Abstract representation of a SIMD register for floating point or integral
+     * values.
+     *
+     * @tparam T the type of the underlying values.
+     * @tparam A the architecture this batch is tied to.
+     **/
+    template <class T, class A>
+    class batch : public types::simd_register<T, A>, public types::integral_only_operators<T, A>
+    {
+
+    public:
+        static constexpr std::size_t size = sizeof(types::simd_register<T, A>) / sizeof(T); ///< Number of scalar elements in this batch.
+
+        using value_type = T; ///< Type of the scalar elements within this batch.
+        using arch_type = A; ///< SIMD Architecture abstracted by this batch.
+        using register_type = typename types::simd_register<T, A>::register_type; ///< SIMD register type abstracted by this batch.
+        using batch_bool_type = batch_bool<T, A>; ///< Associated batch type used to represent logical operations on this batch.
+
+        // constructors
+        batch() = default; ///< Create a batch initialized with undefined values.
+        batch(T val) noexcept; ///< Create a batch with all elements initialized to \c val.
+        template <class... Ts>
+        batch(T val0, T val1, Ts... vals) noexcept; ///< Element-wise initialization; exactly \c size values are required.
+        explicit batch(batch_bool_type const& b) noexcept; ///< Conversion from the associated boolean batch.
+        batch(register_type reg) noexcept; ///< Wrap a native SIMD register.
+
+        template <class U>
+        XSIMD_NO_DISCARD static batch broadcast(U val) noexcept;
+
+        // memory operators
+        template <class U>
+        void store_aligned(U* mem) const noexcept;
+        template <class U>
+        void store_unaligned(U* mem) const noexcept;
+        template <class U>
+        void store(U* mem, aligned_mode) const noexcept;
+        template <class U>
+        void store(U* mem, unaligned_mode) const noexcept;
+
+        template <class U>
+        XSIMD_NO_DISCARD static batch load_aligned(U const* mem) noexcept;
+        template <class U>
+        XSIMD_NO_DISCARD static batch load_unaligned(U const* mem) noexcept;
+        template <class U>
+        XSIMD_NO_DISCARD static batch load(U const* mem, aligned_mode) noexcept;
+        template <class U>
+        XSIMD_NO_DISCARD static batch load(U const* mem, unaligned_mode) noexcept;
+
+        template <class U, class V>
+        XSIMD_NO_DISCARD static batch gather(U const* src, batch<V, arch_type> const& index) noexcept;
+        template <class U, class V>
+        void scatter(U* dst, batch<V, arch_type> const& index) const noexcept;
+
+        T get(std::size_t i) const noexcept;
+
+        // comparison operators. Defined as friend to enable automatic
+        // conversion of parameters from scalar to batch, at the cost of using a
+        // proxy implementation from details::.
+        friend batch_bool<T, A> operator==(batch const& self, batch const& other) noexcept
+        {
+            return details::eq<T, A>(self, other);
+        }
+        friend batch_bool<T, A> operator!=(batch const& self, batch const& other) noexcept
+        {
+            return details::neq<T, A>(self, other);
+        }
+        friend batch_bool<T, A> operator>=(batch const& self, batch const& other) noexcept
+        {
+            return details::ge<T, A>(self, other);
+        }
+        friend batch_bool<T, A> operator<=(batch const& self, batch const& other) noexcept
+        {
+            return details::le<T, A>(self, other);
+        }
+        friend batch_bool<T, A> operator>(batch const& self, batch const& other) noexcept
+        {
+            return details::gt<T, A>(self, other);
+        }
+        friend batch_bool<T, A> operator<(batch const& self, batch const& other) noexcept
+        {
+            return details::lt<T, A>(self, other);
+        }
+
+        // Update operators
+        inline batch& operator+=(batch const& other) noexcept;
+        inline batch& operator-=(batch const& other) noexcept;
+        inline batch& operator*=(batch const& other) noexcept;
+        inline batch& operator/=(batch const& other) noexcept;
+        inline batch& operator&=(batch const& other) noexcept;
+        inline batch& operator|=(batch const& other) noexcept;
+        inline batch& operator^=(batch const& other) noexcept;
+
+        // incr/decr operators
+        inline batch& operator++() noexcept;
+        inline batch& operator--() noexcept;
+        inline batch operator++(int) noexcept;
+        inline batch operator--(int) noexcept;
+
+        // unary operators
+        inline batch_bool_type operator!() const noexcept;
+        inline batch operator~() const noexcept;
+        inline batch operator-() const noexcept;
+        inline batch operator+() const noexcept;
+
+        // arithmetic operators. They are defined as friend to enable automatic
+        // conversion of parameters from scalar to batch. Inline implementation
+        // is required to avoid warnings. Each one copies self and applies the update operator.
+
+        /** Shorthand for xsimd::add() */
+        friend batch operator+(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) += other;
+        }
+
+        /** Shorthand for xsimd::sub() */
+        friend batch operator-(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) -= other;
+        }
+
+        /** Shorthand for xsimd::mul() */
+        friend batch operator*(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) *= other;
+        }
+
+        /** Shorthand for xsimd::div() */
+        friend batch operator/(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) /= other;
+        }
+
+        /** Shorthand for xsimd::bitwise_and() */
+        friend batch operator&(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) &= other;
+        }
+
+        /** Shorthand for xsimd::bitwise_or() */
+        friend batch operator|(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) |= other;
+        }
+
+        /** Shorthand for xsimd::bitwise_xor() */
+        friend batch operator^(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) ^= other;
+        }
+
+        /** Shorthand for xsimd::logical_and() */
+        friend batch operator&&(batch const& self, batch const& other) noexcept
+        {
+            return batch(self).logical_and(other);
+        }
+
+        /** Shorthand for xsimd::logical_or() */
+        friend batch operator||(batch const& self, batch const& other) noexcept
+        {
+            return batch(self).logical_or(other);
+        }
+
+    private:
+        batch logical_and(batch const& other) const noexcept; // helper backing the friend operator&&
+        batch logical_or(batch const& other) const noexcept; // helper backing the friend operator||
+    };
+
+    template <class T, class A>
+    constexpr std::size_t batch<T, A>::size; // namespace-scope definition of the static member (needed for odr-use before C++17)
+
+    /**
+     * @brief batch of predicate over scalar or complex values.
+     *
+     * Abstract representation of a predicate over SIMD register for scalar or
+     * complex values.
+     *
+     * @tparam T the type of the predicated values.
+     * @tparam A the architecture this batch is tied to.
+     **/
+    template <class T, class A = default_arch>
+    class batch_bool : public types::get_bool_simd_register_t<T, A>
+    {
+        using base_type = types::get_bool_simd_register_t<T, A>;
+
+    public:
+        static constexpr std::size_t size = sizeof(types::simd_register<T, A>) / sizeof(T); ///< Number of scalar elements in this batch.
+
+        using value_type = bool; ///< Type of the scalar elements within this batch.
+        using arch_type = A; ///< SIMD Architecture abstracted by this batch.
+        using register_type = typename base_type::register_type; ///< SIMD register type abstracted by this batch.
+        using batch_type = batch<T, A>; ///< Associated batch type this batch represents logical operations for.
+
+        // constructors
+        batch_bool() = default; ///< Create a batch initialized with undefined values.
+        batch_bool(bool val) noexcept;
+        batch_bool(register_type reg) noexcept;
+        template <class... Ts>
+        batch_bool(bool val0, bool val1, Ts... vals) noexcept;
+
+        template <class Tp>
+        batch_bool(Tp const*) = delete; // keeps a pointer argument from converting to bool and selecting batch_bool(bool)
+
+        // memory operators
+        void store_aligned(bool* mem) const noexcept;
+        void store_unaligned(bool* mem) const noexcept;
+        XSIMD_NO_DISCARD static batch_bool load_aligned(bool const* mem) noexcept;
+        XSIMD_NO_DISCARD static batch_bool load_unaligned(bool const* mem) noexcept;
+
+        bool get(std::size_t i) const noexcept;
+
+        // mask operations
+        uint64_t mask() const noexcept;
+        static batch_bool from_mask(uint64_t mask) noexcept;
+
+        // comparison operators
+        batch_bool operator==(batch_bool const& other) const noexcept;
+        batch_bool operator!=(batch_bool const& other) const noexcept;
+
+        // logical operators
+        batch_bool operator~() const noexcept;
+        batch_bool operator!() const noexcept;
+        batch_bool operator&(batch_bool const& other) const noexcept;
+        batch_bool operator|(batch_bool const& other) const noexcept;
+        batch_bool operator^(batch_bool const& other) const noexcept;
+        batch_bool operator&&(batch_bool const& other) const noexcept;
+        batch_bool operator||(batch_bool const& other) const noexcept;
+
+        // update operators. Not const-qualified: they assign to *this and return a mutable reference to it.
+        batch_bool& operator&=(batch_bool const& other) noexcept { return (*this) = (*this) & other; }
+        batch_bool& operator|=(batch_bool const& other) noexcept { return (*this) = (*this) | other; }
+        batch_bool& operator^=(batch_bool const& other) noexcept { return (*this) = (*this) ^ other; }
+
+    private:
+        template <class U, class... V, size_t I, size_t... Is>
+        static register_type make_register(detail::index_sequence<I, Is...>, U u, V... v) noexcept;
+
+        template <class... V>
+        static register_type make_register(detail::index_sequence<>, V... v) noexcept;
+    };
+
+    template <class T, class A>
+    constexpr std::size_t batch_bool<T, A>::size; // namespace-scope definition of the static member (needed for odr-use before C++17)
+
+    /**
+     * @brief batch of complex values.
+     *
+     * Abstract representation of a SIMD register for complex values.
+     *
+     * @tparam T the type of the underlying values.
+     * @tparam A the architecture this batch is tied to.
+     **/
+    template <class T, class A>
+    class batch<std::complex<T>, A>
+    {
+    public:
+        using value_type = std::complex<T>; ///< Type of the complex elements within this batch.
+        using real_batch = batch<T, A>; ///< Batch type used to hold the real parts and the imaginary parts.
+        using arch_type = A; ///< SIMD Architecture abstracted by this batch.
+        using batch_bool_type = batch_bool<T, A>; ///< Associated batch type used to represent logical operations on this batch.
+
+        static constexpr std::size_t size = real_batch::size; ///< Number of complex elements in this batch.
+
+        // constructors
+        batch() = default; ///< Create a batch initialized with undefined values.
+        batch(value_type const& val) noexcept;
+        batch(real_batch const& real, real_batch const& imag) noexcept;
+
+        batch(real_batch const& real) noexcept;
+        batch(T val) noexcept;
+        template <class... Ts>
+        batch(value_type val0, value_type val1, Ts... vals) noexcept;
+        explicit batch(batch_bool_type const& b) noexcept;
+
+        template <class U>
+        XSIMD_NO_DISCARD static batch broadcast(U val) noexcept;
+
+        // memory operators (split layout: separate buffers for real and imaginary parts)
+        XSIMD_NO_DISCARD static batch load_aligned(const T* real_src, const T* imag_src = nullptr) noexcept;
+        XSIMD_NO_DISCARD static batch load_unaligned(const T* real_src, const T* imag_src = nullptr) noexcept;
+        void store_aligned(T* real_dst, T* imag_dst) const noexcept;
+        void store_unaligned(T* real_dst, T* imag_dst) const noexcept;
+
+        XSIMD_NO_DISCARD static batch load_aligned(const value_type* src) noexcept;
+        XSIMD_NO_DISCARD static batch load_unaligned(const value_type* src) noexcept;
+        void store_aligned(value_type* dst) const noexcept;
+        void store_unaligned(value_type* dst) const noexcept;
+
+        template <class U>
+        XSIMD_NO_DISCARD static batch load(U const* mem, aligned_mode) noexcept;
+        template <class U>
+        XSIMD_NO_DISCARD static batch load(U const* mem, unaligned_mode) noexcept;
+        template <class U>
+        void store(U* mem, aligned_mode) const noexcept;
+        template <class U>
+        void store(U* mem, unaligned_mode) const noexcept;
+
+        real_batch real() const noexcept;
+        real_batch imag() const noexcept;
+
+        value_type get(std::size_t i) const noexcept;
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+        // xtl-related methods (only declared when XSIMD_ENABLE_XTL_COMPLEX is defined)
+        template <bool i3ec>
+        batch(xtl::xcomplex<T, T, i3ec> const& val) noexcept;
+        template <bool i3ec, class... Ts>
+        batch(xtl::xcomplex<T, T, i3ec> val0, xtl::xcomplex<T, T, i3ec> val1, Ts... vals) noexcept;
+
+        template <bool i3ec>
+        XSIMD_NO_DISCARD static batch load_aligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept;
+        template <bool i3ec>
+        XSIMD_NO_DISCARD static batch load_unaligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept;
+        template <bool i3ec>
+        void store_aligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept;
+        template <bool i3ec>
+        void store_unaligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept;
+#endif
+
+        // comparison operators
+        batch_bool<T, A> operator==(batch const& other) const noexcept;
+        batch_bool<T, A> operator!=(batch const& other) const noexcept;
+
+        // Update operators
+        batch& operator+=(batch const& other) noexcept;
+        batch& operator-=(batch const& other) noexcept;
+        batch& operator*=(batch const& other) noexcept;
+        batch& operator/=(batch const& other) noexcept;
+
+        // incr/decr operators
+        batch& operator++() noexcept;
+        batch& operator--() noexcept;
+        batch operator++(int) noexcept;
+        batch operator--(int) noexcept;
+
+        // unary operators
+        batch_bool_type operator!() const noexcept;
+        batch operator~() const noexcept;
+        batch operator-() const noexcept;
+        batch operator+() const noexcept;
+
+        // arithmetic operators. They are defined as friend to enable automatic
+        // conversion of parameters from scalar to batch. Each one copies self and applies the update operator.
+
+        /** Shorthand for xsimd::add() */
+        friend batch operator+(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) += other;
+        }
+
+        /** Shorthand for xsimd::sub() */
+        friend batch operator-(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) -= other;
+        }
+
+        /** Shorthand for xsimd::mul() */
+        friend batch operator*(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) *= other;
+        }
+
+        /** Shorthand for xsimd::div() */
+        friend batch operator/(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) /= other;
+        }
+
+    private:
+        real_batch m_real; // batch of real parts
+        real_batch m_imag; // batch of imaginary parts
+    };
+
+    template <class T, class A>
+    constexpr std::size_t batch<std::complex<T>, A>::size; // namespace-scope definition of the static member (needed for odr-use before C++17)
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <typename T, bool i3ec, typename A>
+    struct batch<xtl::xcomplex<T, T, i3ec>, A> // guard specialization: instantiating it only produces the diagnostic below
+    {
+        static_assert(std::is_same<T, void>::value,
+                      "Please use batch<std::complex<T>, A> initialized from xtl::xcomplex instead");
+    };
+#endif
+}
+
+#include "../arch/xsimd_isa.hpp"
+#include "./xsimd_batch_constant.hpp"
+#include "./xsimd_traits.hpp"
+
+namespace xsimd
+{
+
+    /**
+     * Create a batch with all elements initialized to \c val.
+     */
+    template <class T, class A>
+    inline batch<T, A>::batch(T val) noexcept
+        : types::simd_register<T, A>(kernel::broadcast<A>(val, A {})) // the register base is built from kernel::broadcast
+    {
+        detail::static_check_supported_config<T, A>(); // presumably rejects unsupported (T, A) pairs at compile time
+    }
+
+    /**
+     * Create a batch with elements initialized from \c val0, \c val1, \c vals...
+     * There must be exactly \c size elements in total (enforced by a \c static_assert).
+     */
+    template <class T, class A>
+    template <class... Ts>
+    inline batch<T, A>::batch(T val0, T val1, Ts... vals) noexcept
+        : batch(kernel::set<A>(batch {}, A {}, val0, val1, static_cast<T>(vals)...)) // trailing values are cast to T first
+    {
+        detail::static_check_supported_config<T, A>();
+        static_assert(sizeof...(Ts) + 2 == size, "The constructor requires as many arguments as batch elements.");
+    }
+
+    /**
+     * Converts a \c batch_bool to a \c batch where each element is
+     * set to 0xFF..FF (resp. 0x00..00) if the corresponding element is `true`
+     * (resp. `false`).
+     */
+    template <class T, class A>
+    inline batch<T, A>::batch(batch_bool<T, A> const& b) noexcept
+        : batch(kernel::from_bool(b, A {})) // delegates to another batch constructor with the result of kernel::from_bool
+    {
+    }
+
+    /**
+     * Wraps a compatible native SIMD register as a \c batch. This is generally not needed but
+     * becomes handy when doing architecture-specific operations.
+     */
+    template <class T, class A>
+    inline batch<T, A>::batch(register_type reg) noexcept
+        : types::simd_register<T, A>({ reg }) // brace-initializes the register base from the raw register
+    {
+        detail::static_check_supported_config<T, A>();
+    }
+
+    /**
+     * Static counterpart of batch::batch(T val); \c val is cast to \c T beforehand.
+     */
+    template <class T, class A>
+    template <class U>
+    XSIMD_NO_DISCARD inline batch<T, A> batch<T, A>::broadcast(U val) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch<T, A>(static_cast<T>(val));
+    }
+
+    /**************************
+     * batch memory operators *
+     **************************/
+
+    /**
+     * Copy content of this batch to the buffer \c mem. The memory needs to be
+     * aligned on \c A::alignment() bytes; this is only verified through \c assert.
+     */
+    template <class T, class A>
+    template <class U>
+    inline void batch<T, A>::store_aligned(U* mem) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "store location is not properly aligned");
+        kernel::store_aligned<A>(mem, *this, A {}); // the actual store is performed by the architecture kernel
+    }
+
+    /**
+     * Copy content of this batch to the buffer \c mem. The
+     * memory does not need to be aligned (no alignment assertion is performed here).
+     */
+    template <class T, class A>
+    template <class U>
+    inline void batch<T, A>::store_unaligned(U* mem) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        kernel::store_unaligned<A>(mem, *this, A {}); // the actual store is performed by the architecture kernel
+    }
+
+    /**
+     * Tag-dispatched overload: forwards to batch::store_aligned()
+     */
+    template <class T, class A>
+    template <class U>
+    inline void batch<T, A>::store(U* mem, aligned_mode) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        this->store_aligned(mem);
+    }
+
+    /**
+     * Tag-dispatched overload: forwards to batch::store_unaligned()
+     */
+    template <class T, class A>
+    template <class U>
+    inline void batch<T, A>::store(U* mem, unaligned_mode) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        this->store_unaligned(mem);
+    }
+
+    /**
+     * Loading from aligned memory. May involve a conversion if \c U is different
+     * from \c T.
+     */
+    template <class T, class A>
+    template <class U>
+    inline batch<T, A> batch<T, A>::load_aligned(U const* mem) noexcept
+    {
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "loaded pointer is not properly aligned");
+        detail::static_check_supported_config<T, A>();
+        return kernel::load_aligned<A>(mem, kernel::convert<T> {}, A {});
+    }
+
+    /**
+     * Loading from unaligned memory. May involve a conversion if \c U is different
+     * from \c T.
+     */
+    template <class T, class A>
+    template <class U>
+    inline batch<T, A> batch<T, A>::load_unaligned(U const* mem) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::load_unaligned<A>(mem, kernel::convert<T> {}, A {});
+    }
+
+    /**
+     * Equivalent to batch::load_aligned()
+     */
+    template <class T, class A>
+    template <class U>
+    inline batch<T, A> batch<T, A>::load(U const* mem, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return load_aligned(mem);
+    }
+
+    /**
+     * Equivalent to batch::load_unaligned()
+     */
+    template <class T, class A>
+    template <class U>
+    inline batch<T, A> batch<T, A>::load(U const* mem, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return load_unaligned(mem);
+    }
+
+    /**
+     * Create a new batch gathering elements starting at address \c src and
+     * offset by each element in \c index. \c U must be convertible to \c T.
+     * If \c T is not of the same size as \c U, a \c static_cast is performed
+     * at element gather time.
+     */
+    template <class T, class A>
+    template <typename U, typename V>
+    inline batch<T, A> batch<T, A>::gather(U const* src, batch<V, A> const& index) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        static_assert(std::is_convertible<U, T>::value, "Can't convert from src to this batch's type!");
+        return kernel::gather(batch {}, src, index, A {});
+    }
+
+    /**
+     * Scatter elements from this batch into addresses starting at \c dst
+     * and offset by each element in \c index.
+     * If \c T is not of the same size as \c U, a \c static_cast is performed
+     * at element scatter time.
+     */
+    template <class T, class A>
+    template <class U, class V>
+    inline void batch<T, A>::scatter(U* dst, batch<V, A> const& index) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        static_assert(std::is_convertible<T, U>::value, "Can't convert from this batch's type to dst!");
+        kernel::scatter<A>(*this, dst, index, A {});
+    }
+
+    /**
+     * Retrieve the \c i th scalar element in this batch.
+     *
+     * \warning This is very inefficient and should only be used for debugging purposes.
+     */
+    template <class T, class A>
+    inline T batch<T, A>::get(std::size_t i) const noexcept
+    {
+        return kernel::get(*this, i, A {});
+    }
+
+    /******************************
+     * batch comparison operators *
+     ******************************/
+    namespace details
+    {
+        /**
+         * Shorthand for xsimd::eq()
+         */
+        template <class T, class A>
+        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other) noexcept
+        {
+            detail::static_check_supported_config<T, A>();
+            return kernel::eq<A>(self, other, A {});
+        }
+
+        /**
+         * Shorthand for xsimd::neq()
+         */
+        template <class T, class A>
+        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other) noexcept
+        {
+            detail::static_check_supported_config<T, A>();
+            return kernel::neq<A>(self, other, A {});
+        }
+
+        /**
+         * Shorthand for xsimd::ge()
+         */
+        template <class T, class A>
+        inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other) noexcept
+        {
+            detail::static_check_supported_config<T, A>();
+            return kernel::ge<A>(self, other, A {});
+        }
+
+        /**
+         * Shorthand for xsimd::le()
+         */
+        template <class T, class A>
+        inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other) noexcept
+        {
+            detail::static_check_supported_config<T, A>();
+            return kernel::le<A>(self, other, A {});
+        }
+
+        /**
+         * Shorthand for xsimd::gt()
+         */
+        template <class T, class A>
+        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other) noexcept
+        {
+            detail::static_check_supported_config<T, A>();
+            return kernel::gt<A>(self, other, A {});
+        }
+
+        /**
+         * Shorthand for xsimd::lt()
+         */
+        template <class T, class A>
+        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other) noexcept
+        {
+            detail::static_check_supported_config<T, A>();
+            return kernel::lt<A>(self, other, A {});
+        }
+    }
+
+    /**************************
+     * batch update operators *
+     **************************/
+
+    template <class T, class A>
+    inline batch<T, A>& batch<T, A>::operator+=(batch<T, A> const& other) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return *this = kernel::add<A>(*this, other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& batch<T, A>::operator-=(batch<T, A> const& other) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return *this = kernel::sub<A>(*this, other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& batch<T, A>::operator*=(batch<T, A> const& other) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return *this = kernel::mul<A>(*this, other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& batch<T, A>::operator/=(batch<T, A> const& other) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return *this = kernel::div<A>(*this, other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& types::integral_only_operators<T, A>::operator%=(batch<T, A> const& other) noexcept
+    {
+        ::xsimd::detail::static_check_supported_config<T, A>();
+        return *static_cast<batch<T, A>*>(this) = kernel::mod<A>(*static_cast<batch<T, A>*>(this), other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& batch<T, A>::operator&=(batch<T, A> const& other) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return *this = kernel::bitwise_and<A>(*this, other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& batch<T, A>::operator|=(batch<T, A> const& other) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return *this = kernel::bitwise_or<A>(*this, other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& batch<T, A>::operator^=(batch<T, A> const& other) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return *this = kernel::bitwise_xor<A>(*this, other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& types::integral_only_operators<T, A>::operator>>=(batch<T, A> const& other) noexcept
+    { // Right-shifts the batch<T, A> that `this` is downcast to by `other` and stores the result back into it.
+        ::xsimd::detail::static_check_supported_config<T, A>();
+        return *static_cast<batch<T, A>*>(this) = kernel::bitwise_rshift<A>(*static_cast<batch<T, A>*>(this), other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& types::integral_only_operators<T, A>::operator<<=(batch<T, A> const& other) noexcept
+    { // Left-shifts the batch<T, A> that `this` is downcast to by `other` and stores the result back into it.
+        ::xsimd::detail::static_check_supported_config<T, A>();
+        return *static_cast<batch<T, A>*>(this) = kernel::bitwise_lshift<A>(*static_cast<batch<T, A>*>(this), other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& types::integral_only_operators<T, A>::operator>>=(int32_t other) noexcept
+    { // Right-shifts the batch<T, A> that `this` is downcast to by the scalar `other` and stores the result back.
+        ::xsimd::detail::static_check_supported_config<T, A>();
+        return *static_cast<batch<T, A>*>(this) = kernel::bitwise_rshift<A>(*static_cast<batch<T, A>*>(this), other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& types::integral_only_operators<T, A>::operator<<=(int32_t other) noexcept
+    { // Left-shifts the batch<T, A> that `this` is downcast to by the scalar `other` and stores the result back.
+        ::xsimd::detail::static_check_supported_config<T, A>();
+        return *static_cast<batch<T, A>*>(this) = kernel::bitwise_lshift<A>(*static_cast<batch<T, A>*>(this), other, A {});
+    }
+
+    /*****************************
+     * batch incr/decr operators *
+     *****************************/
+
+    template <class T, class A>
+    inline batch<T, A>& batch<T, A>::operator++() noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return operator+=(1);
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& batch<T, A>::operator--() noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return operator-=(1);
+    }
+
+    template <class T, class A>
+    inline batch<T, A> batch<T, A>::operator++(int) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        batch<T, A> copy(*this);
+        operator+=(1);
+        return copy;
+    }
+
+    template <class T, class A>
+    inline batch<T, A> batch<T, A>::operator--(int) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        batch copy(*this);
+        operator-=(1);
+        return copy;
+    }
+
+    /*************************
+     * batch unary operators *
+     *************************/
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch<T, A>::operator!() const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::eq<A>(*this, batch(0), A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A> batch<T, A>::operator~() const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_not<A>(*this, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A> batch<T, A>::operator-() const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::neg<A>(*this, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A> batch<T, A>::operator+() const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return *this;
+    }
+
+    /************************
+     * batch private method *
+     ************************/
+
+    template <class T, class A>
+    inline batch<T, A> batch<T, A>::logical_and(batch<T, A> const& other) const noexcept
+    {
+        return kernel::logical_and<A>(*this, other, A());
+    }
+
+    template <class T, class A>
+    inline batch<T, A> batch<T, A>::logical_or(batch<T, A> const& other) const noexcept
+    {
+        return kernel::logical_or<A>(*this, other, A());
+    }
+
+    /***************************
+     * batch_bool constructors *
+     ***************************/
+
+    template <class T, class A>
+    inline batch_bool<T, A>::batch_bool(register_type reg) noexcept
+        : types::get_bool_simd_register_t<T, A>({ reg })
+    {
+    }
+
+    template <class T, class A>
+    template <class... Ts>
+    inline batch_bool<T, A>::batch_bool(bool val0, bool val1, Ts... vals) noexcept
+        : batch_bool(kernel::set<A>(batch_bool {}, A {}, val0, val1, static_cast<bool>(vals)...))
+    {
+        static_assert(sizeof...(Ts) + 2 == size, "The constructor requires as many arguments as batch elements.");
+    }
+
+    /*******************************
+     * batch_bool memory operators *
+     *******************************/
+
+    template <class T, class A>
+    inline void batch_bool<T, A>::store_aligned(bool* mem) const noexcept
+    {
+        kernel::store(*this, mem, A {});
+    }
+
+    template <class T, class A>
+    inline void batch_bool<T, A>::store_unaligned(bool* mem) const noexcept
+    {
+        store_aligned(mem);
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::load_aligned(bool const* mem) noexcept
+    {
+        batch_type ref(0);
+        alignas(A::alignment()) T buffer[size]; // aligned staging area so batch_type::load_aligned can be used
+        for (std::size_t i = 0; i < size; ++i)
+            buffer[i] = mem[i] ? 1 : 0; // convert each bool to a T holding 0 or 1
+        return ref != batch_type::load_aligned(&buffer[0]); // compare the staged 0/1 values against the all-zero batch
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::load_unaligned(bool const* mem) noexcept
+    {
+        return load_aligned(mem);
+    }
+
+    /**
+     * Extract a scalar mask representation from this @c batch_bool.
+     *
+     * @return bit mask
+     */
+    template <class T, class A>
+    inline uint64_t batch_bool<T, A>::mask() const noexcept
+    {
+        return kernel::mask(*this, A {});
+    }
+
+    /**
+     * Build a @c batch_bool from the scalar mask representation @p mask.
+     *
+     * @return the batch_bool built by kernel::from_mask()
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::from_mask(uint64_t mask) noexcept
+    {
+        return kernel::from_mask(batch_bool<T, A>(), mask, A {});
+    }
+
+    template <class T, class A>
+    inline bool batch_bool<T, A>::get(std::size_t i) const noexcept
+    {
+        return kernel::get(*this, i, A {});
+    }
+
+    /***********************************
+     * batch_bool comparison operators *
+     ***********************************/
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator==(batch_bool<T, A> const& other) const noexcept
+    {
+        return kernel::eq<A>(*this, other, A {}).data;
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator!=(batch_bool<T, A> const& other) const noexcept
+    {
+        return kernel::neq<A>(*this, other, A {}).data;
+    }
+
+    /********************************
+     * batch_bool logical operators *
+     ********************************/
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator~() const noexcept
+    {
+        return kernel::bitwise_not<A>(*this, A {}).data;
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator!() const noexcept
+    {
+        return operator==(batch_bool(false));
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator&(batch_bool<T, A> const& other) const noexcept
+    {
+        return kernel::bitwise_and<A>(*this, other, A {}).data;
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator|(batch_bool<T, A> const& other) const noexcept
+    {
+        return kernel::bitwise_or<A>(*this, other, A {}).data;
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator^(batch_bool<T, A> const& other) const noexcept
+    {
+        return kernel::bitwise_xor<A>(*this, other, A {}).data;
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator&&(batch_bool const& other) const noexcept
+    {
+        return operator&(other);
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator||(batch_bool const& other) const noexcept
+    {
+        return operator|(other);
+    }
+
+    /******************************
+     * batch_bool private methods *
+     ******************************/
+
+    template <class T, class A>
+    inline batch_bool<T, A>::batch_bool(bool val) noexcept
+        : base_type { make_register(detail::make_index_sequence<size - 1>(), val) }
+    {
+    }
+
+    template <class T, class A>
+    template <class U, class... V, size_t I, size_t... Is>
+    inline auto batch_bool<T, A>::make_register(detail::index_sequence<I, Is...>, U u, V... v) noexcept -> register_type
+    {
+        return make_register(detail::index_sequence<Is...>(), u, u, v...); // recursion step: drop one index, pass one more copy of u
+    }
+
+    template <class T, class A>
+    template <class... V>
+    inline auto batch_bool<T, A>::make_register(detail::index_sequence<>, V... v) noexcept -> register_type
+    {
+        return kernel::set<A>(batch_bool<T, A>(), A {}, v...).data;
+    }
+
+    /*******************************
+     * batch<complex> constructors *
+     *******************************/
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>::batch(value_type const& val) noexcept
+        : m_real(val.real())
+        , m_imag(val.imag())
+    {
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>::batch(real_batch const& real, real_batch const& imag) noexcept
+        : m_real(real)
+        , m_imag(imag)
+    {
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>::batch(real_batch const& real) noexcept
+        : m_real(real)
+        , m_imag(0)
+    {
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>::batch(T val) noexcept
+        : m_real(val)
+        , m_imag(0)
+    {
+    }
+
+    template <class T, class A>
+    template <class... Ts>
+    inline batch<std::complex<T>, A>::batch(value_type val0, value_type val1, Ts... vals) noexcept
+        : batch(kernel::set<A>(batch {}, A {}, val0, val1, static_cast<value_type>(vals)...))
+    {
+        static_assert(sizeof...(Ts) + 2 == size, "as many arguments as batch elements");
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>::batch(batch_bool_type const& b) noexcept
+        : m_real(b)
+        , m_imag(0)
+    {
+    }
+
+    template <class T, class A>
+    template <class U>
+    XSIMD_NO_DISCARD inline batch<std::complex<T>, A> batch<std::complex<T>, A>::broadcast(U val) noexcept
+    {
+        return batch(static_cast<std::complex<T>>(val));
+    }
+
+    /***********************************
+     * batch<complex> memory operators *
+     ***********************************/
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_aligned(const T* real_src, const T* imag_src) noexcept
+    { // Real and imaginary parts come from separate arrays; a null imag_src yields a zero imaginary part.
+        return { batch<T, A>::load_aligned(real_src), imag_src ? batch<T, A>::load_aligned(imag_src) : batch<T, A>(0) };
+    }
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_unaligned(const T* real_src, const T* imag_src) noexcept
+    { // Real and imaginary parts come from separate arrays; a null imag_src yields a zero imaginary part.
+        return { batch<T, A>::load_unaligned(real_src), imag_src ? batch<T, A>::load_unaligned(imag_src) : batch<T, A>(0) };
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_aligned(const value_type* src) noexcept
+    {
+        assert(((reinterpret_cast<uintptr_t>(src) % A::alignment()) == 0)
+               && "loaded pointer is not properly aligned");
+        return kernel::load_complex_aligned<A>(src, kernel::convert<value_type> {}, A {});
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_unaligned(const value_type* src) noexcept
+    {
+        return kernel::load_complex_unaligned<A>(src, kernel::convert<value_type> {}, A {});
+    }
+
+    template <class T, class A>
+    inline void batch<std::complex<T>, A>::store_aligned(value_type* dst) const noexcept
+    {
+        assert(((reinterpret_cast<uintptr_t>(dst) % A::alignment()) == 0)
+               && "store location is not properly aligned");
+        return kernel::store_complex_aligned(dst, *this, A {});
+    }
+
+    template <class T, class A>
+    inline void batch<std::complex<T>, A>::store_unaligned(value_type* dst) const noexcept
+    {
+        return kernel::store_complex_unaligned(dst, *this, A {});
+    }
+
+    template <class T, class A>
+    inline void batch<std::complex<T>, A>::store_aligned(T* real_dst, T* imag_dst) const noexcept
+    {
+        m_real.store_aligned(real_dst);
+        m_imag.store_aligned(imag_dst);
+    }
+
+    template <class T, class A>
+    inline void batch<std::complex<T>, A>::store_unaligned(T* real_dst, T* imag_dst) const noexcept
+    {
+        m_real.store_unaligned(real_dst);
+        m_imag.store_unaligned(imag_dst);
+    }
+
+    template <class T, class A>
+    template <class U>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load(U const* mem, aligned_mode) noexcept
+    {
+        return load_aligned(mem);
+    }
+
+    template <class T, class A>
+    template <class U>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load(U const* mem, unaligned_mode) noexcept
+    {
+        return load_unaligned(mem);
+    }
+
+    template <class T, class A>
+    template <class U>
+    inline void batch<std::complex<T>, A>::store(U* mem, aligned_mode) const noexcept
+    {
+        return store_aligned(mem);
+    }
+
+    template <class T, class A>
+    template <class U>
+    inline void batch<std::complex<T>, A>::store(U* mem, unaligned_mode) const noexcept
+    {
+        return store_unaligned(mem);
+    }
+
+    template <class T, class A>
+    inline auto batch<std::complex<T>, A>::real() const noexcept -> real_batch
+    {
+        return m_real;
+    }
+
+    template <class T, class A>
+    inline auto batch<std::complex<T>, A>::imag() const noexcept -> real_batch
+    {
+        return m_imag;
+    }
+
+    template <class T, class A>
+    inline auto batch<std::complex<T>, A>::get(std::size_t i) const noexcept -> value_type
+    {
+        return kernel::get(*this, i, A {});
+    }
+
+    /**************************************
+     * batch<complex> xtl-related methods *
+     **************************************/
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+
+    template <class T, class A>
+    template <bool i3ec>
+    inline batch<std::complex<T>, A>::batch(xtl::xcomplex<T, T, i3ec> const& val) noexcept
+        : m_real(val.real())
+        , m_imag(val.imag())
+    {
+    }
+
+    template <class T, class A>
+    template <bool i3ec, class... Ts>
+    inline batch<std::complex<T>, A>::batch(xtl::xcomplex<T, T, i3ec> val0, xtl::xcomplex<T, T, i3ec> val1, Ts... vals) noexcept
+        : batch(kernel::set<A>(batch {}, A {}, val0, val1, static_cast<xtl::xcomplex<T, T, i3ec>>(vals)...))
+    {
+        static_assert(sizeof...(Ts) + 2 == size, "as many arguments as batch elements");
+    }
+
+    // Memory layout of an xcomplex and std::complex are the same when xcomplex
+    // stores values and not reference. Unfortunately, this breaks strict
+    // aliasing...
+
+    template <class T, class A>
+    template <bool i3ec>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_aligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept
+    {
+        return load_aligned(reinterpret_cast<std::complex<T> const*>(src));
+    }
+
+    template <class T, class A>
+    template <bool i3ec>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_unaligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept
+    {
+        return load_unaligned(reinterpret_cast<std::complex<T> const*>(src));
+    }
+
+    template <class T, class A>
+    template <bool i3ec>
+    inline void batch<std::complex<T>, A>::store_aligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept
+    {
+        store_aligned(reinterpret_cast<std::complex<T>*>(dst));
+    }
+
+    template <class T, class A>
+    template <bool i3ec>
+    inline void batch<std::complex<T>, A>::store_unaligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept
+    {
+        store_unaligned(reinterpret_cast<std::complex<T>*>(dst));
+    }
+
+#endif
+
+    /***************************************
+     * batch<complex> comparison operators *
+     ***************************************/
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch<std::complex<T>, A>::operator==(batch const& other) const noexcept
+    {
+        return m_real == other.m_real && m_imag == other.m_imag;
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch<std::complex<T>, A>::operator!=(batch const& other) const noexcept
+    {
+        return m_real != other.m_real || m_imag != other.m_imag;
+    }
+
+    /***********************************
+     * batch<complex> update operators *
+     ***********************************/
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator+=(batch const& other) noexcept
+    {
+        m_real += other.m_real;
+        m_imag += other.m_imag;
+        return *this;
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator-=(batch const& other) noexcept
+    {
+        m_real -= other.m_real;
+        m_imag -= other.m_imag;
+        return *this;
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator*=(batch const& other) noexcept
+    {
+        real_batch new_real = real() * other.real() - imag() * other.imag(); // Re: ac - bd
+        real_batch new_imag = real() * other.imag() + imag() * other.real(); // Im: ad + bc
+        m_real = new_real; // both parts were computed from the old values before either member is overwritten
+        m_imag = new_imag;
+        return *this;
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator/=(batch const& other) noexcept
+    {
+        real_batch a = real(); // snapshot of this = a + ib, taken before m_real / m_imag are overwritten
+        real_batch b = imag();
+        real_batch c = other.real(); // divisor = c + id
+        real_batch d = other.imag();
+        real_batch e = c * c + d * d; // squared magnitude of the divisor, the common denominator
+        m_real = (c * a + d * b) / e; // Re((a + ib) / (c + id)) = (ac + bd) / (c^2 + d^2)
+        m_imag = (c * b - d * a) / e; // Im((a + ib) / (c + id)) = (bc - ad) / (c^2 + d^2)
+        return *this;
+    }
+
+    /**************************************
+     * batch<complex> incr/decr operators *
+     **************************************/
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator++() noexcept
+    {
+        return operator+=(1);
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator--() noexcept
+    {
+        return operator-=(1);
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator++(int) noexcept
+    {
+        batch copy(*this);
+        operator+=(1);
+        return copy;
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator--(int) noexcept
+    {
+        batch copy(*this);
+        operator-=(1);
+        return copy;
+    }
+
+    /**********************************
+     * batch<complex> unary operators *
+     **********************************/
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch<std::complex<T>, A>::operator!() const noexcept
+    {
+        return operator==(batch(0));
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator~() const noexcept
+    {
+        return { ~m_real, ~m_imag };
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator-() const noexcept
+    {
+        return { -m_real, -m_imag };
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator+() const noexcept
+    {
+        return { +m_real, +m_imag };
+    }
+
+    /**********************************
+     * size type aliases
+     **********************************/
+
+    namespace details
+    {
+        template <typename T, std::size_t N, class ArchList>
+        struct sized_batch;
+
+        template <typename T, std::size_t N>
+        struct sized_batch<T, N, xsimd::arch_list<>>
+        {
+            using type = void;
+        };
+
+        template <typename T, class Arch, bool BatchExists = xsimd::types::has_simd_register<T, Arch>::value>
+        struct batch_trait;
+
+        template <typename T, class Arch>
+        struct batch_trait<T, Arch, true>
+        {
+            using type = xsimd::batch<T, Arch>;
+            static constexpr std::size_t size = xsimd::batch<T, Arch>::size;
+        };
+
+        template <typename T, class Arch>
+        struct batch_trait<T, Arch, false>
+        {
+            using type = void;
+            static constexpr std::size_t size = 0;
+        };
+
+        template <typename T, std::size_t N, class Arch, class... Archs>
+        struct sized_batch<T, N, xsimd::arch_list<Arch, Archs...>>
+        {
+            using type = typename std::conditional<
+                batch_trait<T, Arch>::size == N,
+                typename batch_trait<T, Arch>::type,
+                typename sized_batch<T, N, xsimd::arch_list<Archs...>>::type>::type;
+        };
+    }
+
+    /**
+     * @brief type utility to select a batch of given type and size
+     *
+     * If one of the available architectures has a native vector type of the
+     * given type and size, sets the @p type member to the appropriate batch
+     * type. Otherwise sets it to @p void.
+     *
+     * @tparam T the type of the underlying values.
+     * @tparam N the number of elements of that type in the batch.
+     **/
+    template <typename T, std::size_t N>
+    struct make_sized_batch
+    {
+        using type = typename details::sized_batch<T, N, supported_architectures>::type;
+    };
+
+    template <typename T, std::size_t N>
+    using make_sized_batch_t = typename make_sized_batch<T, N>::type;
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp b/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp
new file mode 100644
index 0000000000..897f5d889a
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp
@@ -0,0 +1,147 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_BATCH_CONSTANT_HPP
+#define XSIMD_BATCH_CONSTANT_HPP
+
+#include "./xsimd_batch.hpp"
+#include "./xsimd_utils.hpp"
+
+namespace xsimd
+{
+    /**
+     * @brief batch of boolean constants
+     *
+     * Abstract representation of a batch of boolean constants.
+     *
+     * @tparam batch_type the type of the associated batch values.
+     * @tparam Values boolean constants represented by this batch
+     **/
+    template <class batch_type, bool... Values>
+    struct batch_bool_constant
+    {
+        static constexpr std::size_t size = sizeof...(Values); // number of boolean constants held
+        using arch_type = typename batch_type::arch_type;
+        using value_type = bool;
+        static_assert(sizeof...(Values) == batch_type::size, "consistent batch size"); // exactly one constant per element of batch_type
+
+        operator batch_bool<typename batch_type::value_type, arch_type>() const noexcept { return { Values... }; } // converts to a batch_bool initialized from Values...
+
+        constexpr bool get(size_t i) const noexcept // returns the i-th constant; constexpr for parity with batch_constant::get
+        {
+            return std::array<value_type, size> { { Values... } }[i];
+        }
+
+        static constexpr int mask() noexcept // bit k of the result is Values[k]; NOTE(review): accumulates in int, confirm it is only used for batches narrower than the width of int
+        {
+            return mask_helper(0, static_cast<int>(Values)...);
+        }
+
+    private:
+        static constexpr int mask_helper(int acc) noexcept { return acc; } // recursion end: every constant has been merged into acc
+        template <class... Tys>
+        static constexpr int mask_helper(int acc, int mask, Tys... masks) noexcept
+        {
+            return mask_helper(acc | mask, (masks << 1)...); // merge the head, shift each remaining constant one more bit to the left
+        }
+    };
+
+    /**
+     * @brief batch of integral constants
+     *
+     * Abstract representation of a batch of integral constants.
+     *
+     * @tparam batch_type the type of the associated batch values.
+     * @tparam Values constants represented by this batch
+     **/
+    template <class batch_type, typename batch_type::value_type... Values>
+    struct batch_constant
+    {
+        static constexpr std::size_t size = sizeof...(Values); // number of constants held
+        using arch_type = typename batch_type::arch_type;
+        using value_type = typename batch_type::value_type;
+        static_assert(sizeof...(Values) == batch_type::size, "consistent batch size"); // exactly one constant per element of batch_type
+
+        /**
+         * @brief Generate a batch of @p batch_type from this @p batch_constant
+         */
+        operator batch_type() const noexcept { return { Values... }; }
+
+        /**
+         * @brief Get the @p i th element of this @p batch_constant
+         */
+        constexpr value_type get(size_t i) const noexcept
+        {
+            return get(i, std::array<value_type, size> { Values... }); // delegates to the private overload, which indexes through a const reference
+        }
+
+    private:
+        constexpr value_type get(size_t i, std::array<value_type, size> const& values) const noexcept // helper: returns values[i]
+        {
+            return values[i];
+        }
+    };
+
+    namespace detail
+    {
+        template <class batch_type, class G, std::size_t... Is>
+        inline constexpr auto make_batch_constant(detail::index_sequence<Is...>) noexcept
+            -> batch_constant<batch_type, (typename batch_type::value_type)G::get(Is, sizeof...(Is))...> // one value per index: G::get(index, index count) cast to the batch's value_type
+        {
+            return {}; // the constants live entirely in the return type
+        }
+        template <class batch_type, class G, std::size_t... Is>
+        inline constexpr auto make_batch_bool_constant(detail::index_sequence<Is...>) noexcept
+            -> batch_bool_constant<batch_type, G::get(Is, sizeof...(Is))...> // one bool per index: G::get(index, index count)
+        {
+            return {}; // the constants live entirely in the return type
+        }
+
+    } // namespace detail
+
+    /**
+     * @brief Build a @c batch_constant out of a generator function
+     *
+     * @tparam batch_type type of the (non-constant) batch to build
+     * @tparam G type used to generate that batch. That type must have a static
+     * member @c get that's used to generate the batch constant. Consequently, the
+     * generated batch_constant has value `{G::get(0, batch_size), ... , G::get(batch_size - 1, batch_size)}`
+     *
+     * The following generator produces a batch of `(n - 1, 0, 1, ... n-2)`
+     *
+     * @code
+     * struct Rot
+     * {
+     *     static constexpr unsigned get(unsigned i, unsigned n)
+     *     {
+     *         return (i + n - 1) % n;
+     *     }
+     * };
+     * @endcode
+     */
+    template <class batch_type, class G>
+    inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>()))
+    {
+        return detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>()); // passes an index sequence of length batch_type::size to the detail helper
+    }
+
+    template <class batch_type, class G> // boolean counterpart of make_batch_constant: G::get supplies the bool for each index
+    inline constexpr auto make_batch_bool_constant() noexcept
+        -> decltype(detail::make_batch_bool_constant<batch_type, G>(
+            detail::make_index_sequence<batch_type::size>()))
+    {
+        return detail::make_batch_bool_constant<batch_type, G>(
+            detail::make_index_sequence<batch_type::size>()); // passes an index sequence of length batch_type::size to the detail helper
+    }
+
+} // namespace xsimd
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp
new file mode 100644
index 0000000000..26ab9ed051
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp
@@ -0,0 +1,46 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_FMA3_AVX2_REGISTER_HPP
+#define XSIMD_FMA3_AVX2_REGISTER_HPP
+
+#include "./xsimd_avx2_register.hpp"
+
+namespace xsimd
+{
+    template <typename arch>
+    struct fma3; // declared without a definition here; specialized per base architecture
+
+    /**
+     * @ingroup arch
+     *
+     * AVX2 + FMA instructions
+     */
+    template <>
+    struct fma3<avx2> : avx2
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX2; } // mirrors the XSIMD_WITH_FMA3_AVX2 configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true in this declaration
+        static constexpr unsigned version() noexcept { return generic::version(2, 2, 1); } // encodes (major, minor, patch) = (2, 2, 1), i.e. 20201
+        static constexpr char const* name() noexcept { return "fma3+avx2"; }
+    };
+
+#if XSIMD_WITH_FMA3_AVX2
+    namespace types
+    {
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx2>, avx2); // fma3<avx2> reuses the register types declared for avx2
+
+    }
+#endif
+
+}
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp
new file mode 100644
index 0000000000..676322efea
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp
@@ -0,0 +1,46 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_FMA3_AVX_REGISTER_HPP
+#define XSIMD_FMA3_AVX_REGISTER_HPP
+
+#include "./xsimd_avx_register.hpp"
+
+namespace xsimd
+{
+    template <typename arch>
+    struct fma3; // declared without a definition here; specialized per base architecture
+
+    /**
+     * @ingroup arch
+     *
+     * AVX + FMA instructions
+     */
+    template <>
+    struct fma3<avx> : avx
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX; } // mirrors the XSIMD_WITH_FMA3_AVX configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true in this declaration
+        static constexpr unsigned version() noexcept { return generic::version(2, 1, 1); } // encodes (major, minor, patch) = (2, 1, 1), i.e. 20101
+        static constexpr char const* name() noexcept { return "fma3+avx"; }
+    };
+
+#if XSIMD_WITH_FMA3_AVX
+    namespace types
+    {
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx>, avx); // fma3<avx> reuses the register types declared for avx
+
+    }
+#endif
+
+}
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp
new file mode 100644
index 0000000000..ad7cf9f1bd
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp
@@ -0,0 +1,46 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_FMA3_SSE_REGISTER_HPP
+#define XSIMD_FMA3_SSE_REGISTER_HPP
+
+#include "./xsimd_sse4_2_register.hpp"
+
+namespace xsimd
+{
+    template <typename arch>
+    struct fma3; // declared without a definition here; specialized per base architecture
+
+    /**
+     * @ingroup arch
+     *
+     * SSE4.2 + FMA instructions
+     */
+    template <>
+    struct fma3<sse4_2> : sse4_2
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_SSE; } // mirrors the XSIMD_WITH_FMA3_SSE configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true in this declaration
+        static constexpr unsigned version() noexcept { return generic::version(1, 4, 3); } // encodes (major, minor, patch) = (1, 4, 3), i.e. 10403
+        static constexpr char const* name() noexcept { return "fma3+sse4.2"; }
+    };
+
+#if XSIMD_WITH_FMA3_SSE
+    namespace types
+    {
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<sse4_2>, sse4_2); // fma3<sse4_2> reuses the register types declared for sse4_2
+
+    }
+#endif
+
+}
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_fma4_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_fma4_register.hpp
new file mode 100644
index 0000000000..26b4af706e
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_fma4_register.hpp
@@ -0,0 +1,42 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_FMA4_REGISTER_HPP
+#define XSIMD_FMA4_REGISTER_HPP
+
+#include "./xsimd_sse4_2_register.hpp"
+
+namespace xsimd
+{
+    /**
+     * @ingroup arch
+     *
+     * FMA4 instructions
+     */
+    struct fma4 : sse4_2
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA4; } // mirrors the XSIMD_WITH_FMA4 configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true in this declaration
+        static constexpr unsigned version() noexcept { return generic::version(1, 4, 4); } // encodes (major, minor, patch) = (1, 4, 4), i.e. 10404
+        static constexpr char const* name() noexcept { return "fma4"; }
+    };
+
+#if XSIMD_WITH_FMA4
+    namespace types
+    {
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma4, sse4_2); // fma4 reuses the register types declared for sse4_2
+
+    }
+#endif
+
+}
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp b/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp
new file mode 100644
index 0000000000..18c12a0beb
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp
@@ -0,0 +1,35 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_ARCH_HPP
+#define XSIMD_GENERIC_ARCH_HPP
+
+#include "../config/xsimd_config.hpp"
+
+/**
+ * @defgroup arch Architecture description
+ * */
+namespace xsimd
+{
+    struct generic // base of the architecture descriptors; supplies defaults and the version encoding helper
+    {
+        static constexpr bool supported() noexcept { return true; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr std::size_t alignment() noexcept { return 0; } // default: no alignment value
+        static constexpr bool requires_alignment() noexcept { return false; }
+        static constexpr unsigned version() noexcept { return generic::version(0, 0, 0); } // encodes to 0
+
+    protected:
+        static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch) noexcept { return major * 10000u + minor * 100u + patch; } // packs the triple into one number: major * 10000 + minor * 100 + patch
+    };
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_neon64_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_neon64_register.hpp
new file mode 100644
index 0000000000..64a159dc9c
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_neon64_register.hpp
@@ -0,0 +1,52 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_NEON64_REGISTER_HPP
+#define XSIMD_NEON64_REGISTER_HPP
+
+#include "xsimd_neon_register.hpp"
+
+namespace xsimd
+{
+    /**
+     * @ingroup arch
+     *
+     * NEON instructions for arm64
+     */
+    struct neon64 : neon
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_NEON64; } // mirrors the XSIMD_WITH_NEON64 configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true in this declaration
+        static constexpr bool requires_alignment() noexcept { return true; }
+        static constexpr std::size_t alignment() noexcept { return 16; } // presumably bytes - TODO confirm
+        static constexpr unsigned version() noexcept { return generic::version(8, 1, 0); } // encodes (major, minor, patch) = (8, 1, 0), i.e. 80100
+        static constexpr char const* name() noexcept { return "arm64+neon"; }
+    };
+
+#if XSIMD_WITH_NEON64
+
+    namespace types
+    {
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(neon64, neon); // neon64 reuses every register type declared for neon
+        XSIMD_DECLARE_SIMD_REGISTER(double, neon64, float64x2_t); // adds a double register, which plain neon marks as invalid
+
+        template <class T>
+        struct get_bool_simd_register<T, neon64>
+            : detail::neon_bool_simd_register<T, neon64> // boolean batches use the register of the unsigned integer with the same sizeof as T
+        {
+        };
+    }
+
+#endif
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_neon_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_neon_register.hpp
new file mode 100644
index 0000000000..570fe98f07
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_neon_register.hpp
@@ -0,0 +1,155 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_NEON_REGISTER_HPP
+#define XSIMD_NEON_REGISTER_HPP
+
+#include "xsimd_generic_arch.hpp"
+#include "xsimd_register.hpp"
+
+#if XSIMD_WITH_NEON
+#include <arm_neon.h>
+#endif
+
+namespace xsimd
+{
+    /**
+     * @ingroup arch
+     *
+     * NEON instructions for arm32
+     */
+    struct neon : generic
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_NEON; } // mirrors the XSIMD_WITH_NEON configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true in this declaration
+        static constexpr bool requires_alignment() noexcept { return true; }
+        static constexpr std::size_t alignment() noexcept { return 16; } // presumably bytes - TODO confirm
+        static constexpr unsigned version() noexcept { return generic::version(7, 0, 0); } // encodes (major, minor, patch) = (7, 0, 0), i.e. 70000
+        static constexpr char const* name() noexcept { return "arm32+neon"; }
+    };
+
+#if XSIMD_WITH_NEON
+    namespace types
+    {
+        namespace detail
+        {
+            template <size_t S>
+            struct neon_vector_type_impl; // keyed by 8 * sizeof(T) (see the aliases below); only 8, 16, 32 and 64 are specialized
+
+            template <>
+            struct neon_vector_type_impl<8>
+            {
+                using signed_type = int8x16_t;
+                using unsigned_type = uint8x16_t;
+            };
+
+            template <>
+            struct neon_vector_type_impl<16>
+            {
+                using signed_type = int16x8_t;
+                using unsigned_type = uint16x8_t;
+            };
+
+            template <>
+            struct neon_vector_type_impl<32>
+            {
+                using signed_type = int32x4_t;
+                using unsigned_type = uint32x4_t;
+            };
+
+            template <>
+            struct neon_vector_type_impl<64>
+            {
+                using signed_type = int64x2_t;
+                using unsigned_type = uint64x2_t;
+            };
+
+            template <class T>
+            using signed_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::signed_type; // signed vector whose key is 8 * sizeof(T)
+
+            template <class T>
+            using unsigned_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::unsigned_type; // unsigned vector whose key is 8 * sizeof(T)
+
+            template <class T>
+            using neon_vector_type = typename std::conditional<std::is_signed<T>::value, // picks by the signedness of T
+                                                               signed_neon_vector_type<T>,
+                                                               unsigned_neon_vector_type<T>>::type;
+
+            using char_neon_vector_type = typename std::conditional<std::is_signed<char>::value, // same selection, fixed to plain char
+                                                                    signed_neon_vector_type<char>,
+                                                                    unsigned_neon_vector_type<char>>::type;
+        }
+
+        XSIMD_DECLARE_SIMD_REGISTER(signed char, neon, detail::neon_vector_type<signed char>); // integer registers are chosen by element size and signedness
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, neon, detail::neon_vector_type<unsigned char>);
+        XSIMD_DECLARE_SIMD_REGISTER(char, neon, detail::char_neon_vector_type); // follows std::is_signed<char>
+        XSIMD_DECLARE_SIMD_REGISTER(short, neon, detail::neon_vector_type<short>);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, neon, detail::neon_vector_type<unsigned short>);
+        XSIMD_DECLARE_SIMD_REGISTER(int, neon, detail::neon_vector_type<int>);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, neon, detail::neon_vector_type<unsigned int>);
+        XSIMD_DECLARE_SIMD_REGISTER(long int, neon, detail::neon_vector_type<long int>);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, neon, detail::neon_vector_type<unsigned long int>);
+        XSIMD_DECLARE_SIMD_REGISTER(long long int, neon, detail::neon_vector_type<long long int>);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, neon, detail::neon_vector_type<unsigned long long int>);
+        XSIMD_DECLARE_SIMD_REGISTER(float, neon, float32x4_t);
+        XSIMD_DECLARE_INVALID_SIMD_REGISTER(double, neon); // has_simd_register<double, neon> is std::false_type
+
+        namespace detail
+        {
+            template <size_t S>
+            struct get_unsigned_type; // keyed by sizeof(T) (see neon_bool_simd_register); only 1, 2, 4 and 8 are specialized
+
+            template <>
+            struct get_unsigned_type<1>
+            {
+                using type = uint8_t;
+            };
+
+            template <>
+            struct get_unsigned_type<2>
+            {
+                using type = uint16_t;
+            };
+
+            template <>
+            struct get_unsigned_type<4>
+            {
+                using type = uint32_t;
+            };
+
+            template <>
+            struct get_unsigned_type<8>
+            {
+                using type = uint64_t;
+            };
+
+            template <size_t S>
+            using get_unsigned_type_t = typename get_unsigned_type<S>::type; // shorthand for get_unsigned_type<S>::type
+
+            template <class T, class A>
+            struct neon_bool_simd_register
+            {
+                using type = simd_register<get_unsigned_type_t<sizeof(T)>, A>; // register of the unsigned integer with the same sizeof as T, for architecture A
+            };
+        }
+
+        template <class T>
+        struct get_bool_simd_register<T, neon>
+            : detail::neon_bool_simd_register<T, neon> // boolean batches use the register of the unsigned integer with the same sizeof as T
+        {
+        };
+
+    }
+#endif
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_register.hpp
new file mode 100644
index 0000000000..52d8c6a9ff
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_register.hpp
@@ -0,0 +1,94 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_REGISTER_HPP
+#define XSIMD_REGISTER_HPP
+
+#include <type_traits>
+
+namespace xsimd
+{
+    namespace types
+    {
+        template <class T, class A>
+        struct has_simd_register : std::false_type // default: no register known for (T, A) until a specialization says otherwise
+        {
+        };
+
+        template <class T, class Arch>
+        struct simd_register // primary template: fallback when no specialization matches (T, Arch)
+        {
+            struct register_type // empty placeholder type
+            {
+            };
+        };
+        // Specializes simd_register<SCALAR_TYPE, ISA> as a wrapper holding a VECTOR_TYPE (implicitly convertible to it) and sets has_simd_register<SCALAR_TYPE, ISA> to std::true_type.
+#define XSIMD_DECLARE_SIMD_REGISTER(SCALAR_TYPE, ISA, VECTOR_TYPE) \
+    template <>                                                    \
+    struct simd_register<SCALAR_TYPE, ISA>                         \
+    {                                                              \
+        using register_type = VECTOR_TYPE;                         \
+        register_type data;                                        \
+        operator register_type() const noexcept                    \
+        {                                                          \
+            return data;                                           \
+        }                                                          \
+    };                                                             \
+    template <>                                                    \
+    struct has_simd_register<SCALAR_TYPE, ISA> : std::true_type    \
+    {                                                              \
+    }
+        // Explicitly sets has_simd_register<SCALAR_TYPE, ISA> to std::false_type; no simd_register specialization is declared.
+#define XSIMD_DECLARE_INVALID_SIMD_REGISTER(SCALAR_TYPE, ISA)    \
+    template <>                                                  \
+    struct has_simd_register<SCALAR_TYPE, ISA> : std::false_type \
+    {                                                            \
+    }
+        // Makes ISA reuse ISA_BASE's registers: simd_register<T, ISA> derives from simd_register<T, ISA_BASE>, and has_simd_register<T, ISA> derives from has_simd_register<T, ISA_BASE>.
+#define XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ISA, ISA_BASE)                          \
+    template <class T>                                                            \
+    struct simd_register<T, ISA> : simd_register<T, ISA_BASE>                     \
+    {                                                                             \
+        using register_type = typename simd_register<T, ISA_BASE>::register_type; \
+        simd_register(register_type reg) noexcept                                 \
+            : simd_register<T, ISA_BASE> { reg }                                  \
+        {                                                                         \
+        }                                                                         \
+        simd_register() = default;                                                \
+    };                                                                            \
+    template <class T>                                                            \
+    struct has_simd_register<T, ISA> : has_simd_register<T, ISA_BASE>             \
+    {                                                                             \
+    }
+
+        template <class T, class Arch>
+        struct get_bool_simd_register // default: a boolean batch uses the same register as the value batch
+        {
+            using type = simd_register<T, Arch>;
+        };
+
+        template <class T, class Arch>
+        using get_bool_simd_register_t = typename get_bool_simd_register<T, Arch>::type; // shorthand for get_bool_simd_register<T, Arch>::type
+    }
+
+    namespace kernel
+    {
+        template <class A>
+        // makes requires_arch equal to A const&, using type_traits functions
+        using requires_arch = typename std::add_lvalue_reference<typename std::add_const<A>::type>::type;
+        template <class T>
+        struct convert // empty type parameterized on T; NOTE(review): presumably a tag naming a conversion target - confirm against callers
+        {
+        };
+    }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp
new file mode 100644
index 0000000000..60aab86897
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp
@@ -0,0 +1,61 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSE2_REGISTER_HPP
+#define XSIMD_SSE2_REGISTER_HPP
+
+#include "./xsimd_generic_arch.hpp"
+#include "./xsimd_register.hpp"
+
+#if XSIMD_WITH_SSE2
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#endif
+
+namespace xsimd
+{
+    /**
+     * @ingroup arch
+     *
+     * SSE2 instructions
+     */
+    struct sse2 : generic
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE2; } // mirrors the XSIMD_WITH_SSE2 configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true in this declaration
+        static constexpr bool requires_alignment() noexcept { return true; }
+        static constexpr unsigned version() noexcept { return generic::version(1, 2, 0); } // encodes (major, minor, patch) = (1, 2, 0), i.e. 10200
+        static constexpr std::size_t alignment() noexcept { return 16; } // presumably bytes - TODO confirm
+        static constexpr char const* name() noexcept { return "sse2"; }
+    };
+
+#if XSIMD_WITH_SSE2
+    namespace types
+    {
+        XSIMD_DECLARE_SIMD_REGISTER(bool, sse2, __m128i); // bool and every integer type below share the __m128i register
+        XSIMD_DECLARE_SIMD_REGISTER(signed char, sse2, __m128i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sse2, __m128i);
+        XSIMD_DECLARE_SIMD_REGISTER(char, sse2, __m128i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sse2, __m128i);
+        XSIMD_DECLARE_SIMD_REGISTER(short, sse2, __m128i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sse2, __m128i);
+        XSIMD_DECLARE_SIMD_REGISTER(int, sse2, __m128i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sse2, __m128i);
+        XSIMD_DECLARE_SIMD_REGISTER(long int, sse2, __m128i);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sse2, __m128i);
+        XSIMD_DECLARE_SIMD_REGISTER(long long int, sse2, __m128i);
+        XSIMD_DECLARE_SIMD_REGISTER(float, sse2, __m128); // float uses __m128
+        XSIMD_DECLARE_SIMD_REGISTER(double, sse2, __m128d); // double uses __m128d
+    }
+#endif
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sse3_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sse3_register.hpp
new file mode 100644
index 0000000000..d8dec5ae4f
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_sse3_register.hpp
@@ -0,0 +1,45 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSE3_REGISTER_HPP
+#define XSIMD_SSE3_REGISTER_HPP
+
+#include "./xsimd_sse2_register.hpp"
+
+#if XSIMD_WITH_SSE3
+#include <pmmintrin.h>
+#endif
+
+namespace xsimd
+{
+    /**
+     * @ingroup arch
+     *
+     * SSE3 instructions: architecture tag type deriving from sse2.
+     */
+    struct sse3 : sse2
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE3; } // value of the XSIMD_WITH_SSE3 configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true for this tag
+        static constexpr unsigned version() noexcept { return generic::version(1, 3, 0); } // encoding is defined by generic::version (not visible here)
+        static constexpr char const* name() noexcept { return "sse3"; } // requires_alignment()/alignment() are not redeclared: inherited from sse2
+    };
+
+#if XSIMD_WITH_SSE3
+    namespace types
+    {
+        // NOTE(review): presumably makes sse3 reuse the register types declared for sse2; the macro is defined outside this file -- confirm
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse3, sse2);
+    }
+#endif // XSIMD_WITH_SSE3
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp
new file mode 100644
index 0000000000..2cf0085b01
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp
@@ -0,0 +1,44 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSE4_1_REGISTER_HPP
+#define XSIMD_SSE4_1_REGISTER_HPP
+
+#include "./xsimd_ssse3_register.hpp"
+
+#if XSIMD_WITH_SSE4_1
+#include <smmintrin.h>
+#endif
+
+namespace xsimd
+{
+    /**
+     * @ingroup arch
+     *
+     * SSE4.1 instructions: architecture tag type deriving from ssse3.
+     */
+    struct sse4_1 : ssse3
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_1; } // value of the XSIMD_WITH_SSE4_1 configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true for this tag
+        static constexpr unsigned version() noexcept { return generic::version(1, 4, 1); } // encoding is defined by generic::version (not visible here)
+        static constexpr char const* name() noexcept { return "sse4.1"; } // note the dot: differs from the C++ identifier sse4_1
+    };
+
+#if XSIMD_WITH_SSE4_1
+    namespace types
+    {
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_1, ssse3); // NOTE(review): presumably reuses the ssse3 register types; macro defined outside this file -- confirm
+    }
+#endif // XSIMD_WITH_SSE4_1
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp
new file mode 100644
index 0000000000..10c2fe23cc
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp
@@ -0,0 +1,44 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSE4_2_REGISTER_HPP
+#define XSIMD_SSE4_2_REGISTER_HPP
+
+#include "./xsimd_sse4_1_register.hpp"
+
+#if XSIMD_WITH_SSE4_2
+#include <nmmintrin.h>
+#endif
+
+namespace xsimd
+{
+    /**
+     * @ingroup arch
+     *
+     * SSE4.2 instructions: architecture tag type deriving from sse4_1.
+     */
+    struct sse4_2 : sse4_1
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_2; } // value of the XSIMD_WITH_SSE4_2 configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true for this tag
+        static constexpr unsigned version() noexcept { return generic::version(1, 4, 2); } // encoding is defined by generic::version (not visible here)
+        static constexpr char const* name() noexcept { return "sse4.2"; } // note the dot: differs from the C++ identifier sse4_2
+    };
+
+#if XSIMD_WITH_SSE4_2
+    namespace types
+    {
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_2, sse4_1); // NOTE(review): presumably reuses the sse4_1 register types; macro defined outside this file -- confirm
+    }
+#endif // XSIMD_WITH_SSE4_2
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp
new file mode 100644
index 0000000000..0f70633bb8
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp
@@ -0,0 +1,44 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSSE3_REGISTER_HPP
+#define XSIMD_SSSE3_REGISTER_HPP
+
+#include "./xsimd_sse3_register.hpp"
+
+#if XSIMD_WITH_SSSE3
+#include <tmmintrin.h>
+#endif
+
+namespace xsimd
+{
+    /**
+     * @ingroup arch
+     *
+     * SSSE3 instructions: architecture tag type deriving from sse3.
+     */
+    struct ssse3 : sse3
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_SSSE3; } // value of the XSIMD_WITH_SSSE3 configuration macro
+        static constexpr bool available() noexcept { return true; } // unconditionally true for this tag
+        static constexpr unsigned version() noexcept { return generic::version(1, 3, 1); } // sse3 passes (1, 3, 0); this tag passes (1, 3, 1)
+        static constexpr char const* name() noexcept { return "ssse3"; } // identifier string of this tag
+    };
+
+#if XSIMD_WITH_SSSE3
+    namespace types
+    {
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ssse3, sse3); // NOTE(review): presumably reuses the sse3 register types; macro defined outside this file -- confirm
+    }
+#endif // XSIMD_WITH_SSSE3
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp
new file mode 100644
index 0000000000..c8a532ada5
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp
@@ -0,0 +1,155 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ * Copyright (c) Yibo Cai                                                   *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SVE_REGISTER_HPP
+#define XSIMD_SVE_REGISTER_HPP
+
+#include "xsimd_generic_arch.hpp"
+#include "xsimd_register.hpp"
+
+#if XSIMD_WITH_SVE
+#include <arm_sve.h>
+#endif
+
+namespace xsimd
+{
+    namespace detail
+    {
+        /**
+         * @ingroup arch
+         *
+         * SVE instructions (fixed vector size) for arm64; the tag is parameterized by the vector width @c Width.
+         */
+        template <size_t Width>
+        struct sve : xsimd::generic
+        {
+            static constexpr bool supported() noexcept { return Width == XSIMD_SVE_BITS; } // only the instantiation whose Width equals XSIMD_SVE_BITS is supported
+            static constexpr bool available() noexcept { return true; } // unconditionally true for this tag
+            static constexpr bool requires_alignment() noexcept { return true; }
+            static constexpr std::size_t alignment() noexcept { return 16; } // fixed at 16 regardless of Width; presumably bytes -- TODO confirm
+            static constexpr unsigned version() noexcept { return generic::version(9, 0, 0); } // encoding is defined by generic::version (not visible here)
+            static constexpr char const* name() noexcept { return "arm64+sve"; } // same string for every Width
+        };
+    }
+
+#if XSIMD_WITH_SVE
+
+    using sve = detail::sve<__ARM_FEATURE_SVE_BITS>; // public tag: the detail template instantiated at the compiler-reported __ARM_FEATURE_SVE_BITS
+
+    namespace types
+    {
+        namespace detail
+        {
+// Define one fixed-size alias per sizeless SVE type by attaching arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS) to it.
+#define SVE_TO_FIXED_SIZE(ty) ty __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)))
+            using sve_int8_t = SVE_TO_FIXED_SIZE(svint8_t);
+            using sve_uint8_t = SVE_TO_FIXED_SIZE(svuint8_t);
+            using sve_int16_t = SVE_TO_FIXED_SIZE(svint16_t);
+            using sve_uint16_t = SVE_TO_FIXED_SIZE(svuint16_t);
+            using sve_int32_t = SVE_TO_FIXED_SIZE(svint32_t);
+            using sve_uint32_t = SVE_TO_FIXED_SIZE(svuint32_t);
+            using sve_int64_t = SVE_TO_FIXED_SIZE(svint64_t);
+            using sve_uint64_t = SVE_TO_FIXED_SIZE(svuint64_t);
+            using sve_float32_t = SVE_TO_FIXED_SIZE(svfloat32_t);
+            using sve_float64_t = SVE_TO_FIXED_SIZE(svfloat64_t);
+            using sve_bool_t = SVE_TO_FIXED_SIZE(svbool_t); // predicate type; used by sve_bool_simd_register further down
+#undef SVE_TO_FIXED_SIZE
+
+            template <size_t S> // S is the element width in bits (instantiated below as 8 * sizeof(T))
+            struct sve_vector_type_impl; // primary template is declared only; 8, 16, 32 and 64 are the specialized widths
+
+            template <>
+            struct sve_vector_type_impl<8>
+            {
+                using signed_type = sve_int8_t;
+                using unsigned_type = sve_uint8_t;
+                using floating_point_type = void; // no floating-point vector alias at this element width
+            };
+
+            template <>
+            struct sve_vector_type_impl<16>
+            {
+                using signed_type = sve_int16_t;
+                using unsigned_type = sve_uint16_t;
+                using floating_point_type = void; // no floating-point vector alias at this element width
+            };
+
+            template <>
+            struct sve_vector_type_impl<32>
+            {
+                using signed_type = sve_int32_t;
+                using unsigned_type = sve_uint32_t;
+                using floating_point_type = sve_float32_t;
+            };
+
+            template <>
+            struct sve_vector_type_impl<64>
+            {
+                using signed_type = sve_int64_t;
+                using unsigned_type = sve_uint64_t;
+                using floating_point_type = sve_float64_t;
+            };
+
+            template <class T> // the three aliases below index sve_vector_type_impl by the bit width of T
+            using signed_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::signed_type;
+
+            template <class T>
+            using unsigned_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::unsigned_type;
+
+            template <class T>
+            using floating_point_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::floating_point_type;
+
+            template <class T> // floating-point T selects the floating-point vector, otherwise the signed integer vector
+            using signed_int_or_floating_point_sve_vector_type = typename std::conditional<std::is_floating_point<T>::value,
+                                                                                           floating_point_sve_vector_type<T>,
+                                                                                           signed_int_sve_vector_type<T>>::type;
+
+            template <class T> // std::is_signed is true for floating-point and signed integer T; everything else gets the unsigned vector
+            using sve_vector_type = typename std::conditional<std::is_signed<T>::value,
+                                                              signed_int_or_floating_point_sve_vector_type<T>,
+                                                              unsigned_int_sve_vector_type<T>>::type;
+        } // namespace detail
+
+        XSIMD_DECLARE_SIMD_REGISTER(signed char, sve, detail::sve_vector_type<signed char>); // arguments: scalar type, arch tag, register type picked by sve_vector_type
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sve, detail::sve_vector_type<unsigned char>);
+        XSIMD_DECLARE_SIMD_REGISTER(char, sve, detail::sve_vector_type<char>); // signed or unsigned vector depending on std::is_signed<char> for the target
+        XSIMD_DECLARE_SIMD_REGISTER(short, sve, detail::sve_vector_type<short>);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sve, detail::sve_vector_type<unsigned short>);
+        XSIMD_DECLARE_SIMD_REGISTER(int, sve, detail::sve_vector_type<int>);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sve, detail::sve_vector_type<unsigned int>);
+        XSIMD_DECLARE_SIMD_REGISTER(long int, sve, detail::sve_vector_type<long int>);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sve, detail::sve_vector_type<unsigned long int>);
+        XSIMD_DECLARE_SIMD_REGISTER(long long int, sve, detail::sve_vector_type<long long int>);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sve, detail::sve_vector_type<unsigned long long int>);
+        XSIMD_DECLARE_SIMD_REGISTER(float, sve, detail::sve_vector_type<float>);
+        XSIMD_DECLARE_SIMD_REGISTER(double, sve, detail::sve_vector_type<double>); // unlike the sse2 list, no bool entry: see get_bool_simd_register<T, sve>
+
+        namespace detail
+        {
+            struct sve_bool_simd_register // thin wrapper holding one fixed-size SVE predicate value
+            {
+                using register_type = sve_bool_t;
+                register_type data; // the wrapped predicate
+                operator register_type() const noexcept { return data; } // implicit conversion back to the raw predicate type
+            };
+        } // namespace detail
+
+        template <class T> // partial specialization for the sve tag: the result does not depend on T
+        struct get_bool_simd_register<T, sve>
+        {
+            using type = detail::sve_bool_simd_register; // every element type shares the same predicate wrapper
+        };
+    } // namespace types
+#endif
+} // namespace xsimd
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_traits.hpp b/third_party/xsimd/include/xsimd/types/xsimd_traits.hpp
new file mode 100644
index 0000000000..24a78cb5ad
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_traits.hpp
@@ -0,0 +1,319 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_TRAITS_HPP
+#define XSIMD_TRAITS_HPP
+
+#include <type_traits>
+
+#include "xsimd_batch.hpp"
+
+/**
+ * high level type traits
+ *
+ * @defgroup batch_traits Type traits
+ *
+ **/
+
+namespace xsimd
+{
+
+    /**************************************
+     * simd_traits and revert_simd_traits *
+     **************************************/
+
+    template <class T, class A = default_arch> // A defaults to default_arch when omitted
+    struct has_simd_register : types::has_simd_register<T, A> // forwards to the trait of the same name in namespace types
+    {
+    };
+
+    namespace detail
+    {
+        template <class T, bool> // the bool selects between the scalar fallback (false) and the batch case (true)
+        struct simd_traits_impl;
+
+        template <class T>
+        struct simd_traits_impl<T, false>
+        {
+            using type = T; // scalar fallback: the type stands for itself
+            using bool_type = bool;
+            static constexpr size_t size = 1;
+        };
+
+        template <class T>
+        constexpr size_t simd_traits_impl<T, false>::size; // out-of-class definition of the static constexpr member
+
+        template <class T>
+        struct simd_traits_impl<T, true>
+        {
+            using type = batch<T>; // batch<T> with its default architecture argument
+            using bool_type = typename type::batch_bool_type;
+            static constexpr size_t size = type::size;
+        };
+
+        template <class T>
+        constexpr size_t simd_traits_impl<T, true>::size; // out-of-class definition of the static constexpr member
+
+        template <class T, class A>
+        struct static_check_supported_config_emitter
+        {
+            // Instantiating this struct evaluates both asserts; the second is vacuously true when A is unsupported, so only the first fires then.
+            static_assert(A::supported(),
+                          "usage of batch type with unsupported architecture");
+            static_assert(!A::supported() || xsimd::has_simd_register<T, A>::value,
+                          "usage of batch type with unsupported type");
+        };
+
+        template <class T, class A> // std::complex<T> is checked through its underlying T
+        struct static_check_supported_config_emitter<std::complex<T>, A> : static_check_supported_config_emitter<T, A>
+        {
+        };
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+        template <class T, class A, bool i3ec> // xtl::xcomplex<T, T, i3ec> is likewise checked through T
+        struct static_check_supported_config_emitter<xtl::xcomplex<T, T, i3ec>, A> : static_check_supported_config_emitter<T, A>
+        {
+        };
+#endif
+
+        // consistency checker: creating the emitter instantiates it, which evaluates its static_asserts for <T, A>
+        template <class T, class A>
+        void static_check_supported_config()
+        {
+            (void)static_check_supported_config_emitter<T, A>(); // temporary is discarded; only the instantiation matters
+        }
+    }
+
+    template <class T> // picks the batch or scalar implementation from has_simd_register<T> (default architecture)
+    struct simd_traits : detail::simd_traits_impl<T, xsimd::has_simd_register<T>::value>
+    {
+    };
+
+    template <class T> // complex case: register availability is tested on the underlying T
+    struct simd_traits<std::complex<T>>
+        : detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value>
+    {
+    };
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class T, bool i3ec> // xtl complex is mapped onto the std::complex<T> implementation
+    struct simd_traits<xtl::xcomplex<T, T, i3ec>>
+        : detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value>
+    {
+    };
+#endif
+
+    template <class T> // primary template: T is not a default-architecture batch, so it maps to itself
+    struct revert_simd_traits
+    {
+        using type = T;
+        static constexpr size_t size = simd_traits<type>::size;
+    };
+
+    template <class T>
+    constexpr size_t revert_simd_traits<T>::size; // out-of-class definition of the static constexpr member
+
+    template <class T> // batch<T> (default architecture argument) maps back to its element type T
+    struct revert_simd_traits<batch<T>>
+    {
+        using type = T;
+        static constexpr size_t size = batch<T>::size;
+    };
+
+    template <class T>
+    constexpr size_t revert_simd_traits<batch<T>>::size; // out-of-class definition of the static constexpr member
+
+    template <class T> // shorthand for simd_traits<T>::type
+    using simd_type = typename simd_traits<T>::type;
+
+    template <class T> // shorthand for simd_traits<T>::bool_type
+    using simd_bool_type = typename simd_traits<T>::bool_type;
+
+    template <class T> // shorthand for revert_simd_traits<T>::type
+    using revert_simd_type = typename revert_simd_traits<T>::type;
+
+    /********************
+     * simd_return_type *
+     ********************/
+
+    namespace detail
+    {
+        template <class T1, class T2> // value is true if T1 == T2 (non-bool), or T1 is bool and T2 is not, or T1 is float/double/a fixed-width integer/char/complex
+        struct simd_condition
+        {
+            static constexpr bool value = (std::is_same<T1, T2>::value && !std::is_same<T1, bool>::value) || (std::is_same<T1, bool>::value && !std::is_same<T2, bool>::value) || std::is_same<T1, float>::value || std::is_same<T1, double>::value || std::is_same<T1, int8_t>::value || std::is_same<T1, uint8_t>::value || std::is_same<T1, int16_t>::value || std::is_same<T1, uint16_t>::value || std::is_same<T1, int32_t>::value || std::is_same<T1, uint32_t>::value || std::is_same<T1, int64_t>::value || std::is_same<T1, uint64_t>::value || std::is_same<T1, char>::value || detail::is_complex<T1>::value;
+        };
+
+        template <class T1, class T2, class A> // general case: ::type is batch<T2, A> and exists only when simd_condition<T1, T2> holds
+        struct simd_return_type_impl
+            : std::enable_if<simd_condition<T1, T2>::value, batch<T2, A>>
+        {
+        };
+
+        template <class T2, class A> // T1 == bool yields a batch_bool instead of a batch
+        struct simd_return_type_impl<bool, T2, A>
+            : std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
+        {
+        };
+
+        template <class T2, class A> // bool with a complex T2: the batch_bool is over the underlying T2
+        struct simd_return_type_impl<bool, std::complex<T2>, A>
+            : std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
+        {
+        };
+
+        template <class T1, class T2, class A> // complex T1 with a real T2 yields a complex batch over T2
+        struct simd_return_type_impl<std::complex<T1>, T2, A>
+            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
+        {
+        };
+
+        template <class T1, class T2, class A> // complex with complex: the condition is evaluated on the underlying types
+        struct simd_return_type_impl<std::complex<T1>, std::complex<T2>, A>
+            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
+        {
+        };
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX // every xtl::xcomplex combination below yields batch<std::complex<T2>, A>
+        template <class T1, class T2, bool I3EC, class A>
+        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, T2, A>
+            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
+        {
+        };
+
+        template <class T1, class T2, bool I3EC, class A>
+        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, std::complex<T2>, A>
+            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
+        {
+        };
+
+        template <class T1, class T2, bool I3EC, class A> // both sides must share the same I3EC flag to match this specialization
+        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, xtl::xcomplex<T2, T2, I3EC>, A>
+            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
+        {
+        };
+
+        template <class T1, class T2, bool I3EC, class A>
+        struct simd_return_type_impl<std::complex<T1>, xtl::xcomplex<T2, T2, I3EC>, A>
+            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
+        {
+        };
+#endif // XSIMD_ENABLE_XTL_COMPLEX
+    }
+
+    template <class T1, class T2, class A = default_arch> // ill-formed (usable for SFINAE) when the selected impl has no ::type
+    using simd_return_type = typename detail::simd_return_type_impl<T1, T2, A>::type;
+
+    /**
+     * @ingroup batch_traits
+     *
+     * type traits that inherits from @c std::true_type for @c batch<...> types and from
+     * @c std::false_type otherwise.
+     *
+     * @tparam T type to analyze.
+     */
+    template <class T>
+    struct is_batch; // forward declaration; the primary definition follows immediately
+
+    template <class T>
+    struct is_batch : std::false_type // primary template: anything that is not batch<T, A>
+    {
+    };
+
+    template <class T, class A>
+    struct is_batch<batch<T, A>> : std::true_type // matches batch for any element type and architecture
+    {
+    };
+
+    /**
+     * @ingroup batch_traits
+     *
+     * type traits that inherits from @c std::true_type for @c batch_bool<...> types and from
+     * @c std::false_type otherwise.
+     *
+     * @tparam T type to analyze.
+     */
+
+    template <class T>
+    struct is_batch_bool : std::false_type // primary template: anything that is not batch_bool<T, A>
+    {
+    };
+
+    template <class T, class A>
+    struct is_batch_bool<batch_bool<T, A>> : std::true_type // matches batch_bool for any element type and architecture
+    {
+    };
+
+    /**
+     * @ingroup batch_traits
+     *
+     * type traits that inherits from @c std::true_type for @c batch<std::complex<...>>
+     * types and from @c std::false_type otherwise.
+     *
+     * @tparam T type to analyze.
+     */
+
+    template <class T>
+    struct is_batch_complex : std::false_type // primary template: includes batches of non-complex element types
+    {
+    };
+
+    template <class T, class A>
+    struct is_batch_complex<batch<std::complex<T>, A>> : std::true_type // only std::complex element types are matched here
+    {
+    };
+
+    /**
+     * @ingroup batch_traits
+     *
+     * type traits whose @c type field is set to the element type @c T when given
+     * @c batch<T, A> and to the analyzed type itself otherwise.
+     *
+     * @tparam T type to analyze.
+     */
+    template <class T>
+    struct scalar_type
+    {
+        using type = T; // primary template: the type is returned unchanged
+    };
+    template <class T, class A>
+    struct scalar_type<batch<T, A>>
+    {
+        using type = T; // element type of the batch
+    };
+
+    template <class T> // shorthand for scalar_type<T>::type
+    using scalar_type_t = typename scalar_type<T>::type;
+
+    /**
+     * @ingroup batch_traits
+     *
+     * type traits whose @c type field is set to @c T::batch_bool_type if @c
+     * is_batch<T>::value and to @c bool otherwise.
+     *
+     * @tparam T type to analyze.
+     */
+    template <class T>
+    struct mask_type
+    {
+        using type = bool; // primary template: non-batch types use a plain bool mask
+    };
+    template <class T, class A>
+    struct mask_type<batch<T, A>>
+    {
+        using type = typename batch<T, A>::batch_bool_type; // the batch's own boolean batch type
+    };
+
+    template <class T> // shorthand for mask_type<T>::type
+    using mask_type_t = typename mask_type<T>::type;
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_utils.hpp b/third_party/xsimd/include/xsimd/types/xsimd_utils.hpp
new file mode 100644
index 0000000000..aa890f2410
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_utils.hpp
@@ -0,0 +1,530 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_UTILS_HPP
+#define XSIMD_UTILS_HPP
+
+#include <complex>
+#include <cstdint>
+#include <cstring>
+#include <tuple>
+#include <type_traits>
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+#include "xtl/xcomplex.hpp"
+#endif
+
+namespace xsimd
+{
+
+    template <class T, class A> // T: element type, A: architecture tag
+    class batch; // forward declaration only
+
+    template <class T, class A>
+    class batch_bool; // forward declaration only
+
+    /**************
+     * index      *
+     **************/
+
+    template <size_t I> // compile-time index carried as a type
+    using index = std::integral_constant<size_t, I>;
+
+    /**************
+     * as_integer *
+     **************/
+
+    template <class T> // primary template: defers to std::make_signed<T>
+    struct as_integer : std::make_signed<T>
+    {
+    };
+
+    template <>
+    struct as_integer<float>
+    {
+        using type = int32_t; // float maps to int32_t
+    };
+
+    template <>
+    struct as_integer<double>
+    {
+        using type = int64_t; // double maps to int64_t
+    };
+
+    template <class T, class A>
+    struct as_integer<batch<T, A>>
+    {
+        using type = batch<typename as_integer<T>::type, A>; // applied to the element type, architecture unchanged
+    };
+
+    template <class B> // shorthand for as_integer<B>::type
+    using as_integer_t = typename as_integer<B>::type;
+
+    /***********************
+     * as_unsigned_integer *
+     ***********************/
+
+    template <class T> // primary template: defers to std::make_unsigned<T>
+    struct as_unsigned_integer : std::make_unsigned<T>
+    {
+    };
+
+    template <>
+    struct as_unsigned_integer<float>
+    {
+        using type = uint32_t; // float maps to uint32_t
+    };
+
+    template <>
+    struct as_unsigned_integer<double>
+    {
+        using type = uint64_t; // double maps to uint64_t
+    };
+
+    template <class T, class A>
+    struct as_unsigned_integer<batch<T, A>>
+    {
+        using type = batch<typename as_unsigned_integer<T>::type, A>; // applied to the element type, architecture unchanged
+    };
+
+    template <class T> // shorthand for as_unsigned_integer<T>::type
+    using as_unsigned_integer_t = typename as_unsigned_integer<T>::type;
+
+    /*********************
+     * as_signed_integer *
+     *********************/
+
+    template <class T> // plain wrapper over std::make_signed<T>; no float, double or batch specializations are declared in this section
+    struct as_signed_integer : std::make_signed<T>
+    {
+    };
+
+    template <class T> // shorthand for as_signed_integer<T>::type
+    using as_signed_integer_t = typename as_signed_integer<T>::type;
+
+    /*********************
+     * flipped_sign_type *
+     *********************/
+
+    namespace detail
+    {
+        template <class T, bool is_signed> // generic case is used for is_signed == false: unsigned T becomes signed
+        struct flipped_sign_type_impl : std::make_signed<T>
+        {
+        };
+
+        template <class T> // is_signed == true: signed T becomes unsigned
+        struct flipped_sign_type_impl<T, true> : std::make_unsigned<T>
+        {
+        };
+    }
+
+    template <class T> // ::type is T with the opposite signedness, chosen from std::is_signed<T>
+    struct flipped_sign_type
+        : detail::flipped_sign_type_impl<T, std::is_signed<T>::value>
+    {
+    };
+
+    template <class T> // shorthand for flipped_sign_type<T>::type
+    using flipped_sign_type_t = typename flipped_sign_type<T>::type;
+
+    /************
+     * as_float *
+     ************/
+
+    template <class T> // declared only: types other than int32_t, int64_t and batches of them have no ::type
+    struct as_float;
+
+    template <>
+    struct as_float<int32_t>
+    {
+        using type = float; // int32_t maps to float
+    };
+
+    template <>
+    struct as_float<int64_t>
+    {
+        using type = double; // int64_t maps to double
+    };
+
+    template <class T, class A>
+    struct as_float<batch<T, A>>
+    {
+        using type = batch<typename as_float<T>::type, A>; // applied to the element type, architecture unchanged
+    };
+
+    template <class T> // shorthand for as_float<T>::type
+    using as_float_t = typename as_float<T>::type;
+
+    /**************
+     * as_logical *
+     **************/
+
+    template <class T> // declared only: just the batch specialization below provides ::type
+    struct as_logical;
+
+    template <class T, class A>
+    struct as_logical<batch<T, A>>
+    {
+        using type = batch_bool<T, A>; // boolean batch with the same element type and architecture
+    };
+
+    template <class T> // shorthand for as_logical<T>::type
+    using as_logical_t = typename as_logical<T>::type;
+
+    /********************
+     * bit_cast *
+     ********************/
+
+    template <class To, class From> // To must be given explicitly; From is deduced from the argument
+    inline To bit_cast(From val) noexcept
+    {
+        static_assert(sizeof(From) == sizeof(To), "casting between compatible layout"); // sizes must match exactly
+        // FIXME: Some old version of GCC don't support that trait
+        // static_assert(std::is_trivially_copyable<From>::value, "input type is trivially copyable");
+        // static_assert(std::is_trivially_copyable<To>::value, "output type is trivially copyable");
+        To res; // requires To to be default-constructible
+        std::memcpy(&res, &val, sizeof(val)); // copies the bytes of val into res unchanged
+        return res;
+    }
+
+    namespace kernel
+    {
+        namespace detail
+        {
+            /**************************************
+             * enabling / disabling metafunctions *
+             **************************************/
+
+            template <class T> // each alias in this section is int when its condition holds and ill-formed otherwise
+            using enable_integral_t = typename std::enable_if<std::is_integral<T>::value, int>::type;
+
+            template <class T, size_t S> // signed integral T with sizeof(T) == S
+            using enable_sized_signed_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value && sizeof(T) == S, int>::type;
+
+            template <class T, size_t S> // unsigned integral T with sizeof(T) == S
+            using enable_sized_unsigned_t = typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value && sizeof(T) == S, int>::type;
+
+            template <class T, size_t S> // integral T of either signedness with sizeof(T) == S
+            using enable_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) == S, int>::type;
+
+            template <class T, size_t S> // any T with sizeof(T) == S (no integral requirement)
+            using enable_sized_t = typename std::enable_if<sizeof(T) == S, int>::type;
+
+            template <class T, size_t S> // integral T with sizeof(T) <= S
+            using enable_max_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) <= S, int>::type;
+
+            /********************************
+             * Matching & mismatching sizes *
+             ********************************/
+
+            template <class T, class U, class B = int> // B when T and U have the same size
+            using sizes_match_t = typename std::enable_if<sizeof(T) == sizeof(U), B>::type;
+
+            template <class T, class U, class B = int> // B when T and U differ in size
+            using sizes_mismatch_t = typename std::enable_if<sizeof(T) != sizeof(U), B>::type;
+
+            template <class T, class U, class B = int> // B when T and U are distinct types of the same size
+            using stride_match_t = typename std::enable_if<!std::is_same<T, U>::value && sizeof(T) == sizeof(U), B>::type;
+        } // namespace detail
+    } // namespace kernel
+
+    /*****************************************
+     * Backport of index_sequence from c++14 *
+     *****************************************/
+
+    // TODO: Remove this once we drop C++11 support
+    namespace detail
+    {
+        template <typename T> // metafunction exposing T unchanged as ::type; used as a base class by the helpers below
+        struct identity
+        {
+            using type = T;
+        };
+
+#ifdef __cpp_lib_integer_sequence
+        using std::index_sequence; // the standard library provides the sequence utilities: import them as-is
+        using std::integer_sequence;
+        using std::make_index_sequence;
+        using std::make_integer_sequence;
+
+        using std::index_sequence_for;
+#else
+        template <typename T, T... Is> // fallback: local definition of a compile-time sequence of T values
+        struct integer_sequence
+        {
+            using value_type = T;
+            static constexpr std::size_t size() noexcept { return sizeof...(Is); } // number of values in the pack
+        };
+
+        template <typename Lhs, typename Rhs> // declared only; the specialization below handles two integer_sequences
+        struct make_integer_sequence_concat;
+
+        template <typename T, T... Lhs, T... Rhs>
+        struct make_integer_sequence_concat<integer_sequence<T, Lhs...>,
+                                            integer_sequence<T, Rhs...>>
+            : identity<integer_sequence<T, Lhs..., (sizeof...(Lhs) + Rhs)...>> // Lhs followed by Rhs offset by the length of Lhs
+        {
+        };
+
+        template <typename T> // declared only; specialized below on std::integral_constant<T, N>
+        struct make_integer_sequence_impl;
+
+        template <typename T> // N == 0: the empty sequence
+        struct make_integer_sequence_impl<std::integral_constant<T, (T)0>> : identity<integer_sequence<T>>
+        {
+        };
+
+        template <typename T> // N == 1: the single value 0
+        struct make_integer_sequence_impl<std::integral_constant<T, (T)1>> : identity<integer_sequence<T, 0>>
+        {
+        };
+
+        template <typename T, T N> // general case: build the sequences for N / 2 and N - N / 2, then concatenate them
+        struct make_integer_sequence_impl<std::integral_constant<T, N>>
+            : make_integer_sequence_concat<typename make_integer_sequence_impl<std::integral_constant<T, N / 2>>::type,
+                                           typename make_integer_sequence_impl<std::integral_constant<T, N - (N / 2)>>::type>
+        {
+        };
+
+        template <typename T, T N> // the sequence 0, 1, ..., N - 1 with values of type T
+        using make_integer_sequence = typename make_integer_sequence_impl<std::integral_constant<T, N>>::type;
+
+        template <std::size_t... Is> // integer_sequence specialized for std::size_t values
+        using index_sequence = integer_sequence<std::size_t, Is...>;
+
+        template <std::size_t N> // the index sequence 0, 1, ..., N - 1
+        using make_index_sequence = make_integer_sequence<std::size_t, N>;
+
+        template <typename... Ts> // one index per type in the pack Ts
+        using index_sequence_for = make_index_sequence<sizeof...(Ts)>;
+
+#endif
+
+        template <int... Is> // int-valued counterpart of index_sequence
+        using int_sequence = integer_sequence<int, Is...>;
+
+        template <int N> // int-valued counterpart of make_index_sequence
+        using make_int_sequence = make_integer_sequence<int, N>;
+
+        template <typename... Ts> // int-valued counterpart of index_sequence_for
+        using int_sequence_for = make_int_sequence<(int)sizeof...(Ts)>;
+
+        // Builds a P by brace-initializing it with each index Is cast to P::value_type.
+        template <class P, size_t... Is>
+        inline P indexes_from(index_sequence<Is...>) noexcept
+        {
+            return { static_cast<typename P::value_type>(Is)... };
+        }
+
+        template <class P> // returns a P initialized with 0, 1, ..., P::size - 1 (each cast to P::value_type)
+        inline P make_sequence_as_batch() noexcept
+        {
+            return indexes_from<P>(make_index_sequence<P::size>());
+        }
+    }
+
+    /***********************************
+     * Backport of std::get from C++14 *
+     ***********************************/
+
+    namespace detail
+    {
+        template <class T, class... Types, size_t I, size_t... Is> // tag is std::is_same<T, T>: the element at index I has type T
+        inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, T>, index_sequence<I, Is...>) noexcept
+        {
+            return std::get<I>(t);
+        }
+
+        template <class T, class U, class... Types, size_t I, size_t... Is> // generic tag: try the element at index I + 1 next
+        inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, U>, index_sequence<I, Is...>) noexcept
+        {
+            using tuple_elem = typename std::tuple_element<I + 1, std::tuple<Types...>>::type;
+            return get_impl<T>(t, std::is_same<T, tuple_elem>(), index_sequence<Is...>()); // recurse on the remaining indices
+        }
+
+        template <class T, class... Types> // type-based tuple access: the search for an element of type T starts at index 0
+        inline const T& get(const std::tuple<Types...>& t) noexcept
+        {
+            using tuple_elem = typename std::tuple_element<0, std::tuple<Types...>>::type;
+            return get_impl<T>(t, std::is_same<T, tuple_elem>(), make_index_sequence<sizeof...(Types)>());
+        }
+    }
+
+    /*********************************
+     * Backport of void_t from C++17 *
+     *********************************/
+
+    namespace detail
+    {
+        template <class... T> // helper struct whose ::type is void for any pack of types
+        struct make_void
+        {
+            using type = void;
+        };
+
+        template <class... T> // alias to void, routed through make_void so that every T... has to be well-formed
+        using void_t = typename make_void<T...>::type;
+    }
+
+    /**************************************************
+     * Equivalent of void_t but with size_t parameter *
+     **************************************************/
+
+    namespace detail
+    {
+        template <std::size_t> // helper struct whose ::type is void for any std::size_t value
+        struct check_size
+        {
+            using type = void;
+        };
+
+        template <std::size_t S> // alias to void that requires the expression S to be a valid std::size_t constant
+        using check_size_t = typename check_size<S>::type;
+    }
+
+    /*****************************************
+     * Supplementary std::array constructors *
+     *****************************************/
+
+    namespace detail
+    {
+        // Builds a std::array holding one copy of "scalar" per index in Is ("broadcast").
+        template <typename T, std::size_t... Is>
+        inline constexpr std::array<T, sizeof...(Is)>
+        array_from_scalar_impl(const T& scalar, index_sequence<Is...>) noexcept
+        {
+            // The ternary only makes the expression depend on Is so that it can be pack-expanded:
+            // with indices from make_index_sequence, (Is + 1) is nonzero and "scalar" is always chosen.
+            return std::array<T, sizeof...(Is)> { (Is + 1) ? scalar : T()... };
+        }
+
+        template <typename T, std::size_t N> // std::array of N elements, each equal to "scalar"
+        inline constexpr std::array<T, N>
+        array_from_scalar(const T& scalar) noexcept
+        {
+            return array_from_scalar_impl(scalar, make_index_sequence<N>());
+        }
+
+        // Builds a std::array from c_array[Is]... ; c_array is read at every index in Is.
+        template <typename T, std::size_t... Is>
+        inline constexpr std::array<T, sizeof...(Is)>
+        array_from_pointer_impl(const T* c_array, index_sequence<Is...>) noexcept
+        {
+            return std::array<T, sizeof...(Is)> { c_array[Is]... };
+        }
+
+        template <typename T, std::size_t N> // std::array holding c_array[0] ... c_array[N - 1]
+        inline constexpr std::array<T, N>
+        array_from_pointer(const T* c_array) noexcept
+        {
+            return array_from_pointer_impl(c_array, make_index_sequence<N>());
+        }
+    }
+
+    /************************
+     * is_array_initializer *
+     ************************/
+
+    namespace detail
+    {
+        template <bool...> // carrier type for a pack of bools; declared only, it is just compared with std::is_same
+        struct bool_pack;
+
+        template <bool... bs> // true iff every b is true: only then does shifting the pack by one `true` leave it unchanged
+        using all_true = std::is_same<
+            bool_pack<bs..., true>, bool_pack<true, bs...>>;
+
+        template <typename T, typename... Args> // true iff each type in Args is convertible to T
+        using is_all_convertible = all_true<std::is_convertible<Args, T>::value...>;
+
+        template <typename T, std::size_t N, typename... Args> // enable_if: ::type exists iff Args has N types, all convertible to T
+        using is_array_initializer = std::enable_if<
+            (sizeof...(Args) == N) && is_all_convertible<T, Args...>::value>;
+
+        // SFINAE check that a variadic argument pack holds exactly N values convertible to T,
+        // as usable for instantiating a value of type std::array<T, N>.
+        template <typename T, std::size_t N, typename... Args>
+        using is_array_initializer_t = typename is_array_initializer<T, N, Args...>::type;
+    }
+
+    /**************
+     * is_complex *
+     **************/
+
+    // This is used in both xsimd_complex_base.hpp and xsimd_traits.hpp
+    // However xsimd_traits.hpp indirectly includes xsimd_complex_base.hpp
+    // so we cannot define is_complex in xsimd_traits.hpp. Besides, if
+    // no file defining batches is included, we still need this definition
+    // in xsimd_traits.hpp, so let's define it here.
+
+    namespace detail
+    {
+        template <class T> // primary template: T is not a complex type
+        struct is_complex : std::false_type
+        {
+        };
+
+        template <class T> // std::complex<T> is a complex type
+        struct is_complex<std::complex<T>> : std::true_type
+        {
+        };
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+        template <class T, bool i3ec> // xtl::xcomplex<T, T, i3ec> also counts as complex when xtl support is enabled
+        struct is_complex<xtl::xcomplex<T, T, i3ec>> : std::true_type
+        {
+        };
+#endif
+    }
+
+    /*******************
+     * real_batch_type *
+     *******************/
+
+    template <class B> // primary template: B is mapped to itself
+    struct real_batch_type
+    {
+        using type = B;
+    };
+
+    template <class T, class A> // a batch of std::complex<T> is mapped to the batch of T on the same architecture A
+    struct real_batch_type<batch<std::complex<T>, A>>
+    {
+        using type = batch<T, A>;
+    };
+
+    template <class B> // shorthand for real_batch_type<B>::type
+    using real_batch_type_t = typename real_batch_type<B>::type;
+
+    /**********************
+     * complex_batch_type *
+     **********************/
+
+    template <class B> // primary template: B is mapped to the batch of std::complex<B::value_type> on B::arch_type
+    struct complex_batch_type
+    {
+        using real_value_type = typename B::value_type;
+        using arch_type = typename B::arch_type;
+        using type = batch<std::complex<real_value_type>, arch_type>;
+    };
+
+    template <class T, class A> // a batch of std::complex<T> is already complex and is mapped to itself
+    struct complex_batch_type<batch<std::complex<T>, A>>
+    {
+        using type = batch<std::complex<T>, A>;
+    };
+
+    template <class B> // shorthand for complex_batch_type<B>::type
+    using complex_batch_type_t = typename complex_batch_type<B>::type;
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/xsimd.hpp b/third_party/xsimd/include/xsimd/xsimd.hpp
new file mode 100644
index 0000000000..8d76a5f91d
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/xsimd.hpp
@@ -0,0 +1,68 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_HPP
+#define XSIMD_HPP
+
+#if defined(__has_cpp_attribute)
+// __has_cpp_attribute is available, so attribute support can be queried directly
+#if __has_cpp_attribute(nodiscard) >= 201603L
+// the compiler reports support for [[nodiscard]] (the form without a message)
+#define XSIMD_NO_DISCARD [[nodiscard]]
+#endif
+#endif
+
+#if !defined(XSIMD_NO_DISCARD) && __cplusplus >= 201703L
+// the attribute query did not define the macro, but the language level is C++17 or higher
+#define XSIMD_NO_DISCARD [[nodiscard]]
+#endif
+
+#if !defined(XSIMD_NO_DISCARD) && (defined(__GNUC__) || defined(__clang__))
+// still undefined, but GCC or Clang is in use: rely on their warn_unused_result attribute
+#define XSIMD_NO_DISCARD __attribute__((warn_unused_result))
+#endif
+
+#if !defined(XSIMD_NO_DISCARD)
+// none of the checks above succeeded, so fall back to an empty macro
+#define XSIMD_NO_DISCARD
+#endif
+
+#ifdef __cpp_if_constexpr
+// the feature-test macro is defined: the compiler supports the `if constexpr` construct
+#define XSIMD_IF_CONSTEXPR if constexpr
+#endif
+
+#if !defined(XSIMD_IF_CONSTEXPR) && __cplusplus >= 201703L
+// the feature-test macro is missing, but the language level is C++17 or higher
+#define XSIMD_IF_CONSTEXPR if constexpr
+#endif
+
+#if !defined(XSIMD_IF_CONSTEXPR)
+// none of the checks above succeeded, so fall back to a plain `if`
+#define XSIMD_IF_CONSTEXPR if
+#endif
+
+#include "config/xsimd_config.hpp"
+
+#include "arch/xsimd_scalar.hpp"
+#include "memory/xsimd_aligned_allocator.hpp"
+
+#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
+// no type definitions or anything else apart from the scalar definitions and the aligned allocator
+#else
+#include "types/xsimd_batch.hpp"
+#include "types/xsimd_batch_constant.hpp"
+#include "types/xsimd_traits.hpp"
+
+// This include must come last
+#include "types/xsimd_api.hpp"
+#endif
+#endif
diff --git a/third_party/xsimd/moz.yaml b/third_party/xsimd/moz.yaml
new file mode 100644
index 0000000000..f868bb239b
--- /dev/null
+++ b/third_party/xsimd/moz.yaml
@@ -0,0 +1,37 @@
+schema: 1
+
+bugzilla:
+  product: Toolkit
+  component: "General"
+
+origin:
+  name: xsimd
+  description: C++ wrappers for SIMD intrinsics
+
+  url: https://github.com/QuantStack/xsimd
+
+  release: e8f209c3397c8a866be2312682689a04e4abfd66 (2023-02-27T06:32:46Z).
+  revision: e8f209c3397c8a866be2312682689a04e4abfd66
+
+  license: BSD-3-Clause
+
+vendoring:
+  url: https://github.com/QuantStack/xsimd
+  source-hosting: github
+  tracking: commit
+
+  exclude:
+    - ".*"
+    - "*.md"
+    - "*.yml"
+    - "*.txt"
+    - "*.in"
+    - "*.sh"
+    - benchmark
+    - cmake
+    - docs
+    - examples
+    - test
+
+  keep:
+    - include/
-- 
cgit v1.2.3